commit 410c20d4fa03fde758b837883e0b3a5bc872314f
Author: hailin <hailin.zhao@gdzx.xyz>
Date:   Thu Mar 5 22:53:15 2026 -0800

    feat: init my-sora — merge Open-Sora v2.0 (11B) + v1.3 data pipeline tools

diff --git a/.github/workflows/close_issue.yaml b/.github/workflows/close_issue.yaml
new file mode 100644
index 0000000..3b6e24c
--- /dev/null
+++ b/.github/workflows/close_issue.yaml
@@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "30 1 * * *"  # once a day at 01:30 (GitHub evaluates schedules in UTC)
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v9
+        with:
+          days-before-issue-stale: 7  # label an issue as stale after 7 days without activity
+          days-before-issue-close: 7  # close it after 7 further inactive days once labelled stale
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open for 7 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
+          days-before-pr-stale: -1  # -1 opts pull requests out of stale marking (per actions/stale docs)
+          days-before-pr-close: -1  # -1 opts pull requests out of automatic closing
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/github_page.yaml b/.github/workflows/github_page.yaml
new file mode 100644
index 0000000..483c2ad
--- /dev/null
+++ b/.github/workflows/github_page.yaml
@@ -0,0 +1,30 @@
+name: GitHub Pages
+
+on:
+  workflow_dispatch:  # manual trigger only; there is no push or schedule trigger
+
+jobs:
+  deploy:
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write  # required by the Deploy step, which pushes the built site
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}
+    steps:
+      - uses: actions/checkout@v4  # v4 to match setup-node@v4 (v3 runs on the deprecated Node 16 runtime)
+        with:
+          ref: gallery  # the site sources live on the gallery branch
+
+      - name: Setup Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - run: npm install
+      - run: npm run build
+
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v4
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./build  # output directory of `npm run build`
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b0b62a0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,198 @@
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+.vscode/
+
+# macos
+*.DS_Store
+
+# misc files
+data/
+dataset/
+runs/
+checkpoints/
+outputs/
+outputs
+samples/
+samples
+logs/
+pretrained_models/
+pretrained_models
+evaluation_results/
+cache/
+*.swp
+debug/
+
+# Secret files
+hostfiles/
+hostfile*
+run.sh
+gradio_cached_examples/
+wandb/
+
+# npm
+node_modules/
+package-lock.json
+package.json
+
+exps
+ckpts
+flash-attention
+datasets
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..bce9cb8
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,31 @@
+repos:
+
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.2.1
+    hooks:
+      - id: autoflake
+        name: autoflake (python)
+        args: ['--in-place']
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: sort all imports (python)
+
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.9.1
+    hooks:
+      - id: black
+        name: black formatter
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-merge-conflict
+      - id: check-case-conflict
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ['--fix=lf']
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..2acbec4
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,100 @@
+# Contributing
+
+The Open-Sora project welcomes any constructive contribution from the community and the team is more than willing to work on problems you have encountered to make it a better project.
+
+## Development Environment Setup
+
+To contribute to Open-Sora, we would first like to guide you through setting up a proper development environment so that you can work on your code more easily. You can install this library from source with the `editable` flag (`-e`, for development mode) so that your changes to the source code are reflected at runtime without re-installation.
+
+You can refer to the [Installation Section](./README.md#installation) and replace `pip install -v .` with `pip install -v -e .`.
+
+### Code Style
+
+We run some static checks when you commit your code changes. Please make sure that you pass all the checks and that your coding style meets our requirements. We use pre-commit hooks to make sure the code is aligned with our coding standard. To set up the code style checking, follow the steps below.
+
+```shell
+# these commands are executed under the Open-Sora directory
+pip install pre-commit
+pre-commit install
+```
+
+Code format checking will be automatically executed when you commit your changes.
+
+## Contribution Guide
+
+You need to follow the steps below to make a contribution to the main repository via a pull request. You can learn about the details of pull requests [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).
+
+### 1. Fork the Official Repository
+
+First, visit the [Open-Sora repository](https://github.com/hpcaitech/Open-Sora) and fork it into your own account. The `fork` button is at the top right corner of the web page, alongside buttons such as `watch` and `star`.
+
+Now, you can clone your own forked repository into your local environment.
+
+```shell
+git clone https://github.com/<YOUR-USERNAME>/Open-Sora.git
+```
+
+### 2. Configure Git
+
+You need to set the official repository as your upstream so that you can synchronize with the latest update in the official repository. You can learn about upstream [here](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams).
+
+Then add the original repository as upstream:
+
+```shell
+cd Open-Sora
+git remote add upstream https://github.com/hpcaitech/Open-Sora.git
+```
+
+You can use the following command to verify that the remote is set. You should see both `origin` and `upstream` in the output.
+
+```shell
+git remote -v
+```
+
+### 3. Synchronize with Official Repository
+
+Before you make changes to the codebase, it is always good to fetch the latest updates in the official repository. In order to do so, you can use the commands below.
+
+```shell
+git fetch upstream
+git checkout main
+git merge upstream/main
+git push origin main
+```
+
+### 4. Create a New Branch
+
+You should not make changes to the `main` branch of your forked repository as this might make upstream synchronization difficult. You can create a new branch with an appropriate name. Branch names should generally start with `hotfix/` or `feature/`: `hotfix` is for a bug fix and `feature` is for the addition of a new feature.
+
+```shell
+git checkout -b <NEW-BRANCH-NAME>
+```
+
+### 5. Implementation and Code Commit
+
+Now you can implement your code change in the source code. Remember that you installed the project in development mode, so you do not need to uninstall and reinstall it for the code to take effect. The code change will be reflected in every new Python execution.
+You can commit the changes to your local repository and push them to your fork. The changes should be kept logical, modular and atomic.
+
+```shell
+git add -A
+git commit -m "<COMMIT-MESSAGE>"
+git push -u origin <NEW-BRANCH-NAME>
+```
+
+### 6. Open a Pull Request
+
+You can now create a pull request on the GitHub webpage of your repository. The source branch is `<NEW-BRANCH-NAME>` of your repository and the target branch should be `main` of `hpcaitech/Open-Sora`. After creating this pull request, you should be able to see it [here](https://github.com/hpcaitech/Open-Sora/pulls).
+
+The Open-Sora team will review your code change and merge your code if applicable.
+
+## FAQ
+
+1. `pylint` cannot recognize some members:
+
+Add this into your `settings.json` in VSCode:
+
+```json
+"pylint.args": [
+ "--generated-members=numpy.* ,torch.*,cv2.*",
+],
+```
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..71b9a62
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,26 @@
+FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0
+
+# image metadata (OCI annotations); LABEL needs key=value with no spaces around '='
+LABEL org.opencontainers.image.source="https://github.com/hpcaitech/Open-Sora"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.base.name="docker.io/hpcaitech/pytorch-cuda:2.1.0-12.1.0"
+
+# install library dependencies, then drop the apt package lists to keep the layer small
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 && rm -rf /var/lib/apt/lists/*
+
+# install flash attention
+RUN pip install flash-attn --no-build-isolation
+
+# install apex
+RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
+
+# install xformers
+RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121
+
+# Set the working directory
+WORKDIR /workspace/Open-Sora
+# Copy the current directory contents into the container at /workspace/Open-Sora
+COPY . .
+
+# install this project
+RUN pip install -v .
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e4edb5f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,350 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2024 HPC-AI Technology Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   =========================================================================
+   This project is inspired by the listed projects and is subject to the following licenses:
+
+   10. [T5: Text-To-Text Transfer Transformer](https://github.com/google-research/text-to-text-transfer-transformer)
+
+   Copyright 2019 Google
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   11. [CLIP](https://github.com/openai/CLIP/tree/main)
+
+   MIT License
+
+   Copyright (c) 2021 OpenAI
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+
+   12. [FLUX](https://github.com/black-forest-labs/flux)
+
+   Copyright 2024 Black Forest Labs
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   13. [EfficientViT](https://github.com/mit-han-lab/efficientvit)
+
+   Copyright [2023] [Han Cai]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   14. [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/tree/main)
+
+   TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
+   Tencent HunyuanVideo Release Date: December 3, 2024
+   THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
+   By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+
+   1. DEFINITIONS.
+   a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
+   b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
+   c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
+   d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
+   e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
+   f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
+   g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
+   h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
+   i. “Tencent,” “We” or “Us” shall mean THL A29 Limited.
+   j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo released at [https://github.com/Tencent/HunyuanVideo].
+   k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
+   l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
+   m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
+   n. “including” shall mean including but not limited to.
+   2. GRANT OF RIGHTS.
+   We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
+   3. DISTRIBUTION.
+   You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
+   a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
+   b. You must cause any modified files to carry prominent notices stating that You changed the files;
+   c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
+   d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2024 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
+   You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
+   4. ADDITIONAL COMMERCIAL TERMS.
+   If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
+   5. RULES OF USE.
+   a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
+   b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
+   c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
+   6. INTELLECTUAL PROPERTY.
+   a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
+   b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
+   c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
+   d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
+   7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
+   a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
+   b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
+   c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+   8. SURVIVAL AND TERMINATION.
+   a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+   b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
+   9. GOVERNING LAW AND JURISDICTION.
+   a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+   b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
+
+   EXHIBIT A
+   ACCEPTABLE USE POLICY
+
+   Tencent reserves the right to update this Acceptable Use Policy from time to time.
+   Last modified: November 5, 2024
+
+   Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
+
+   1. Outside the Territory;
+   2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
+   3. To harm Yourself or others;
+   4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
+   5. To override or circumvent the safety guardrails and safeguards We have put in place;
+   6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+   7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
+   8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
+   9. To intentionally defame, disparage or otherwise harass others;
+   10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
+   11. To generate or disseminate personal identifiable information with the purpose of harming others;
+   12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
+   13. To impersonate another individual without consent, authorization, or legal right;
+   14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
+   15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
+   16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
+   17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
+   18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+   19. For military purposes;
+   20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..72b6e8d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,349 @@
+<p align="center">
+    <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/icon.png" width="250"/>
+</p>
+<div align="center">
+    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
+    <a href="https://arxiv.org/abs/2503.09642v1"><img src="https://img.shields.io/static/v1?label=Tech Report 2.0&message=Arxiv&color=red"></a>
+    <a href="https://arxiv.org/abs/2412.20404"><img src="https://img.shields.io/static/v1?label=Tech Report 1.2&message=Arxiv&color=red"></a>
+    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
+</div>
+
+<div align="center">
+    <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
+    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
+    <a href="https://x.com/YangYou1991/status/1899973689460044010"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
+    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
+</div>
+
+## Open-Sora: Democratizing Efficient Video Production for All
+
+We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model,
+tools and all details accessible to all. By embracing **open-source** principles,
+Open-Sora not only democratizes access to advanced video generation techniques, but also offers a
+streamlined and user-friendly platform that simplifies the complexities of video generation.
+With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation.
+
+🎬 For a professional AI video-generation product, try [Video Ocean](https://video-ocean.com/) — powered by a superior model.
+<div align="center">
+   <a href="https://video-ocean.com/">
+   <img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/3.gif" width="850" />
+   </a>
+</div>
+
+<div align="center">
+   <a href="https://hpc-ai.com/?utm_source=github&utm_medium=social&utm_campaign=promotion-opensora">
+   <img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/1.gif" width="850" />
+   </a>
+</div>
+
+<!-- [[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)] -->
+
+## 📰 News
+
+- **[2025.03.12]** 🔥 We released **Open-Sora 2.0** (11B). 🎬 11B model achieves [on-par performance](#evaluation) with 11B HunyuanVideo & 30B Step-Video on 📐VBench & 📊Human Preference. 🛠️ Fully open-source: checkpoints and training codes for training with only **$200K**. [[report]](https://arxiv.org/abs/2503.09642v1)
+- **[2025.02.20]** 🔥 We released **Open-Sora 1.3** (1B). With the upgraded VAE and Transformer architecture, the quality of our generated videos has been greatly improved 🚀. [[checkpoints]](#open-sora-13-model-weights) [[report]](/docs/report_04.md) [[demo]](https://huggingface.co/spaces/hpcai-tech/open-sora)
+- **[2024.12.23]** The development cost of video generation models has been cut by 50%! Open-source solutions are now available with H200 GPU vouchers. [[blog]](https://company.hpc-ai.com/blog/the-development-cost-of-video-generation-models-has-saved-by-50-open-source-solutions-are-now-available-with-h200-gpu-vouchers) [[code]](https://github.com/hpcaitech/Open-Sora/blob/main/scripts/train.py) [[vouchers]](https://colossalai.org/zh-Hans/docs/get_started/bonus/)
+- **[2024.06.17]** We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#open-sora-12-model-weights) [[report]](/docs/report_03.md) [[arxiv]](https://arxiv.org/abs/2412.20404)
+- **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces.
+- **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]](#open-sora-11-model-weights) [[report]](/docs/report_02.md)
+- **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation.
+  Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with
+  <a href="https://github.com/hpcaitech/ColossalAI"><img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/colossal_ai.png" width="8%" ></a>
+  acceleration,
+  inference, and more. Our model can produce 2s 512x512 videos with only 3 days training. [[checkpoints]](#open-sora-10-model-weights)
+  [[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md)
+- **[2024.03.04]** Open-Sora provides training with 46% cost reduction.
+  [[blog]](https://hpc-ai.com/blog/open-sora)
+
+📍 Since Open-Sora is under active development, we maintain different branches for different versions. The latest version is [main](https://github.com/hpcaitech/Open-Sora). Old versions include: [v1.0](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.0), [v1.1](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.1), [v1.2](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.2), [v1.3](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.3).
+
+## 🎥 Latest Demo
+
+Demos are presented in compressed GIF format for convenience. For original quality samples and their corresponding prompts, please visit our [Gallery](https://hpcaitech.github.io/Open-Sora/).
+
+| **5s 1024×576**                                                                                                                                    | **5s 576×1024**                                                                                                                                    | **5s 576×1024**                                                                                                                                   |
+| -------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0001_1_1.gif" width="">](https://streamable.com/e/8g9y9h?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0160.gif" width="">](https://streamable.com/e/k50mnv?autoplay=1)  | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0017.gif" width="">](https://streamable.com/e/bzrn9n?autoplay=1) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0012_1_1.gif" width="">](https://streamable.com/e/dsv8da?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/douyin_0005.gif" width="">](https://streamable.com/e/3wif07?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0037.gif" width="">](https://streamable.com/e/us2w7h?autoplay=1) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0055_1_1.gif" width="">](https://streamable.com/e/yfwk8i?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/sora_0019.gif" width="">](https://streamable.com/e/jgjil0?autoplay=1)   | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0463.gif" width="">](https://streamable.com/e/lsoai1?autoplay=1) |
+
+<details>
+<summary>OpenSora 1.3 Demo</summary>
+
+| **5s 720×1280**                                                                                                                                                        | **5s 720×1280**                                                                                                                                                           | **5s 720×1280**                                                                                                                                                              |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_tomato.gif" width="">](https://streamable.com/e/r0imrp?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_fisherman.gif" width="">](https://streamable.com/e/hfvjkh?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_girl2.gif" width="">](https://streamable.com/e/kutmma?quality=highest&amp;autoplay=1)        |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_grape.gif" width="">](https://streamable.com/e/osn1la?quality=highest&amp;autoplay=1)  | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_mushroom.gif" width="">](https://streamable.com/e/l1pzws?quality=highest&amp;autoplay=1)  | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_parrot.gif" width="">](https://streamable.com/e/2vqari?quality=highest&amp;autoplay=1)       |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_trans.gif" width="">](https://streamable.com/e/1in7d6?quality=highest&amp;autoplay=1)  | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_bear.gif" width="">](https://streamable.com/e/e9bi4o?quality=highest&amp;autoplay=1)      | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_futureflower.gif" width="">](https://streamable.com/e/09z7xi?quality=highest&amp;autoplay=1) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_fire.gif" width="">](https://streamable.com/e/16c3hk?quality=highest&amp;autoplay=1)   | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_man.gif" width="">](https://streamable.com/e/wi250w?quality=highest&amp;autoplay=1)       | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_black.gif" width="">](https://streamable.com/e/vw5b64?quality=highest&amp;autoplay=1)        |
+
+</details>
+
+<details>
+<summary>OpenSora 1.2 Demo</summary>
+
+| **4s 720×1280**                                                                                                                                                                                     | **4s 720×1280**                                                                                                                                                                                     | **4s 720×1280**                                                                                                                                                                                     |
+| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0013.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_1718.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0087.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0052.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_1719.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0002.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0011.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0004.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0061.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |
+
+</details>
+
+<details>
+<summary>OpenSora 1.1 Demo</summary>
+
+| **2s 240×426**                                                                                                                                                                                                  | **2s 240×426**                                                                                                                                                                                                 |
+| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16x240x426_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_26.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_27.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd)  | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_40.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) |
+
+| **2s 426×240**                                                                                                                                                                                                 | **4s 480×854**                                                                                                                                                                                                  |
+| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x426x240_24.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_32x480x854_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |
+
+| **16s 320×320**                                                                                                                                                                                            | **16s 224×448**                                                                                                                                                                                            | **2s 426×240**                                                                                                                                                                                                |
+| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16s_320x320.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16s_224x448.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x426x240_3.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
+
+</details>
+
+<details>
+<summary>OpenSora 1.0 Demo</summary>
+
+| **2s 512×512**                                                                                                                                                                                   | **2s 512×512**                                                                                                                                                                                   | **2s 512×512**                                                                                                                                                                                   |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) |
+| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop.                   | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff.                      | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall.                                                |
+| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) |
+| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...]                                                                             | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...]                                                                 | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...]                                                                  |
+
+Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display,
+see [here](/assets/texts/t2v_samples.txt) for full prompts.
+
+</details>
+
+## 🔆 Reports
+
+- **[Tech Report of Open-Sora 2.0](https://arxiv.org/abs/2503.09642v1)**
+- **[Step by step to train or finetune your own model](docs/train.md)**
+- **[Step by step to train and evaluate a video autoencoder](docs/ae.md)**
+- **[Visit the high compression video autoencoder](docs/hcae.md)**
+- Reports of previous versions (best viewed in the corresponding branch):
+  - [Open-Sora 1.3](docs/report_04.md): shift-window attention, unified spatial-temporal VAE, etc.
+  - [Open-Sora 1.2](docs/report_03.md), [Tech Report](https://arxiv.org/abs/2412.20404): rectified flow, 3d-VAE, score condition, evaluation, etc.
+  - [Open-Sora 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc.
+  - [Open-Sora 1.0](docs/report_01.md): architecture, captioning, etc.
+
+📍 Since Open-Sora is under active development, we maintain different branches for different versions. The latest version is [main](https://github.com/hpcaitech/Open-Sora). Old versions include: [v1.0](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.0), [v1.1](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.1), [v1.2](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.2), [v1.3](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.3).
+
+## Quickstart
+
+### Installation
+
+```bash
+# create a virtual env and activate (conda as an example)
+conda create -n opensora python=3.10
+conda activate opensora
+
+# download the repo
+git clone https://github.com/hpcaitech/Open-Sora
+cd Open-Sora
+
+# Ensure torch >= 2.4.0
+pip install -v . # for development mode, `pip install -v -e .`
+pip install xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu121 # install xformers according to your cuda version
+pip install flash-attn --no-build-isolation
+```
+
+Optionally, you can install flash attention 3 for faster speed.
+
+```bash
+git clone https://github.com/Dao-AILab/flash-attention # 4f0640d5
+cd flash-attention/hopper
+python setup.py install
+```
+
+### Model Download
+
+Our 11B model supports 256px and 768px resolution. Both T2V and I2V are supported by one model. 🤗 [Huggingface](https://huggingface.co/hpcai-tech/Open-Sora-v2) 🤖 [ModelScope](https://modelscope.cn/models/luchentech/Open-Sora-v2).
+
+Download from huggingface:
+
+```bash
+pip install "huggingface_hub[cli]"
+huggingface-cli download hpcai-tech/Open-Sora-v2 --local-dir ./ckpts
+```
+
+Download from ModelScope:
+
+```bash
+pip install modelscope
+modelscope download luchentech/Open-Sora-v2 --local_dir ./ckpts
+```
+
+### Text-to-Video Generation
+
+Our model is optimized for image-to-video generation, but it can also be used for text-to-video generation. To generate high-quality videos, we build a text-to-image-to-video pipeline with the help of the Flux text-to-image model. For 256x256 resolution:
+
+```bash
+# Generate one given prompt
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea"
+
+# Save memory with offloading
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --offload True
+
+# Generation with csv
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --dataset.data-path assets/texts/example.csv
+```
+
+For 768x768 resolution:
+
+```bash
+# One GPU
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px.py --save-dir samples --prompt "raining, sea"
+
+# Multi-GPU with colossalai sp
+torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px.py --save-dir samples --prompt "raining, sea"
+```
+
+You can adjust the generation aspect ratio by `--aspect_ratio` and the generation length by `--num_frames`. Candidate values for aspect_ratio include `16:9`, `9:16`, `1:1`, `2.39:1`. Candidate values for num_frames should be of the form `4k+1` and less than 129.
+
+You can also run direct text-to-video by:
+
+```bash
+# One GPU for 256px
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --prompt "raining, sea"
+# Multi-GPU for 768px
+torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px.py --prompt "raining, sea"
+```
+
+### Image-to-Video Generation
+
+Given a prompt and a reference image, you can generate a video with the following command:
+
+```bash
+# 256px
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --cond_type i2v_head --prompt "A plump pig wallows in a muddy pond on a rustic farm, its pink snout poking out as it snorts contentedly. The camera captures the pig's playful splashes, sending ripples through the water under the midday sun. Wooden fences and a red barn stand in the background, framed by rolling green hills. The pig's muddy coat glistens in the sunlight, showcasing the simple pleasures of its carefree life." --ref assets/texts/i2v.png
+
+# 256px with csv
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --cond_type i2v_head --dataset.data-path assets/texts/i2v.csv
+
+# Multi-GPU 768px
+torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px.py --cond_type i2v_head --dataset.data-path assets/texts/i2v.csv
+```
+
+## Advanced Usage
+
+### Motion Score
+
+During training, we add a motion score to the text prompt. During inference, you can use the following command to generate videos with a given motion score (the default score is 4):
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --motion-score 4
+```
+
+We also provide a dynamic motion score evaluator. After setting your OpenAI API key, you can use the following command to evaluate the motion score of a video:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --motion-score dynamic
+```
+
+| Score | 1                                                                                                       | 4                                                                                                       | 7                                                                                                       |
+| ----- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------- |
+|       | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_1.gif" width=""> | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_4.gif" width=""> | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_7.gif" width=""> |
+
+### Prompt Refine
+
+We take advantage of ChatGPT to refine the prompt. You can use the following command to refine the prompt. The function is available for both text-to-video and image-to-video generation.
+
+```bash
+export OPENAI_API_KEY=sk-xxxx
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
+```
+
+### Reproducibility
+
+To make the results reproducible, you can set the random seed by:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --sampling_option.seed 42 --seed 42
+```
+
+Use `--num-sample k` to generate `k` samples for each prompt.
+
+## Computational Efficiency
+
+We test the computational efficiency of text-to-video on H100/H800 GPUs. For 256x256, we use colossalai's tensor parallelism, and `--offload True` is used. For 768x768, we use colossalai's sequence parallelism. All runs use 50 sampling steps. The results are presented in the format: $\color{blue}{\text{Total time (s)}}/\color{red}{\text{peak GPU memory (GB)}}$
+
+| Resolution | 1x GPU                                 | 2x GPUs                               | 4x GPUs                               | 8x GPUs                               |
+| ---------- | -------------------------------------- | ------------------------------------- | ------------------------------------- | ------------------------------------- |
+| 256x256    | $\color{blue}{60}/\color{red}{52.5}$   | $\color{blue}{40}/\color{red}{44.3}$  | $\color{blue}{34}/\color{red}{44.3}$  |                                       |
+| 768x768    | $\color{blue}{1656}/\color{red}{60.3}$ | $\color{blue}{863}/\color{red}{48.3}$ | $\color{blue}{466}/\color{red}{44.3}$ | $\color{blue}{276}/\color{red}{44.3}$ |
+
+## Evaluation
+
+On [VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard), Open-Sora 2.0 significantly narrows the gap with OpenAI’s Sora, reducing it from 4.52% (Open-Sora 1.2) to 0.69%.
+
+![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_vbench.png)
+
+Human preference results show our model is on par with HunyuanVideo 11B and Step-Video 30B.
+
+![Win Rate](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_winrate.png)
+
+With strong performance, Open-Sora 2.0 is cost-effective.
+
+![Cost](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_cost.png)
+
+## Contribution
+
+Thanks go to these wonderful contributors:
+
+<a href="https://github.com/hpcaitech/Open-Sora/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=hpcaitech/Open-Sora" />
+</a>
+
+If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md).
+
+## Acknowledgement
+
+Here we only list a few of the projects. For other works and datasets, please refer to our report.
+
+- [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization
+  system.
+- [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
+- [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration framework for DiT training. We adopt valuable
+  acceleration strategies for the training process from OpenDiT.
+- [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
+- [Flux](https://github.com/black-forest-labs/flux): A powerful text-to-image generation model.
+- [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
+- [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/tree/main?tab=readme-ov-file): An open-source text-to-video model.
+- [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
+- [DC-AE](https://github.com/mit-han-lab/efficientvit): Deep Compression AutoEncoder for image compression.
+- [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
+- [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
+- [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
+- [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model.
+- [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured captions.
+
+## Citation
+
+```bibtex
+@article{opensora,
+  title={Open-sora: Democratizing efficient video production for all},
+  author={Zheng, Zangwei and Peng, Xiangyu and Yang, Tianji and Shen, Chenhui and Li, Shenggui and Liu, Hongxin and Zhou, Yukun and Li, Tianyi and You, Yang},
+  journal={arXiv preprint arXiv:2412.20404},
+  year={2024}
+}
+
+@article{opensora2,
+    title={Open-Sora 2.0: Training a Commercial-Level Video Generation Model in \$200k},
+    author={Xiangyu Peng and Zangwei Zheng and Chenhui Shen and Tom Young and Xinying Guo and Binluo Wang and Hang Xu and Hongxin Liu and Mingyan Jiang and Wenjun Li and Yuhui Wang and Anbang Ye and Gang Ren and Qianran Ma and Wanying Liang and Xiang Lian and Xiwen Wu and Yuting Zhong and Zhuangyan Li and Chaoyu Gong and Guojun Lei and Leijun Cheng and Limin Zhang and Minghao Li and Ruijie Zhang and Silan Hu and Shijie Huang and Xiaokang Wang and Yuanheng Zhao and Yuqi Wang and Ziang Wei and Yang You},
+    year={2025},
+    journal={arXiv preprint arXiv:2503.09642},
+}
+```
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)
diff --git a/assets/texts/example.csv b/assets/texts/example.csv
new file mode 100644
index 0000000..97daf3e
--- /dev/null
+++ b/assets/texts/example.csv
@@ -0,0 +1,9 @@
+text
+"Imagine a cyberpunk close-up shot capturing the upper body of a character with an melancholic demeanor. The subject is gesturing with one hand while shaking the head, showcasing natural body language. The background features a vibrant carnival, complementing the character's pose. The lighting is dim and moody, emphasizing the contours of their face and upper body. The camera subtly pans or zooms, drawing attention to the harmony between expression, posture, and setting."
+"A sleek red sports car speeds through a winding mountain road, its engine roaring against the backdrop of towering snow-capped peaks. The sunlight glints off the polished surface, creating dazzling reflections. The camera pans to capture the lush greenery surrounding the road. The atmosphere is exhilarating, with a cinematic style emphasizing speed and adventure. The lighting is golden, suggesting early morning or late afternoon."
+"A group of fluffy baby chicks huddle together under a heat lamp in a rustic barn. Their soft peeping fills the air as they nudge each other for warmth. The wooden floor beneath them is strewn with straw, and the gentle light creates a cozy, heartwarming atmosphere. The video captures their tiny, detailed movements in a close-up, realistic style."
+"A black-and-white film of a pianist playing in an empty theater. His fingers move deftly across the keys, the music echoing in the large, empty hall. Dust motes float in the air, caught in the faint light streaming through the high windows. The grand piano gleams under the spotlight, contrasting with the decaying seats and peeling walls. The atmosphere is haunting and nostalgic."
+"A wave of glowing steam crashes into a stone wall, the vapor hissing and swirling as it dissipates."
+"A tomato surfing on a piece of lettuce down a waterfall of ranch dressing, with exaggerated surfing moves and creamy wave effects to highlight the 3D animated fun."
+"A cheerful panda on a bustling city street, casually playing a violin while sitting on a bench. People passing by stop to enjoy the impromptu performance, and a group of children dance around, clapping their hands to the upbeat tempo. The panda’s paws move swiftly, creating a lively tune that brings a sense of joy and energy to the urban scene."
+"A shimmering, crystalline city built into the side of a massive mountain on a distant planet. Waterfalls of liquid light cascade down the cliffs, with hovering bridges connecting the structures. The entire city glows as it absorbs energy from the planet’s core."
diff --git a/assets/texts/i2v.csv b/assets/texts/i2v.csv
new file mode 100644
index 0000000..32aaf2c
--- /dev/null
+++ b/assets/texts/i2v.csv
@@ -0,0 +1,2 @@
+text,ref
+"A plump pig wallows in a muddy pond on a rustic farm, its pink snout poking out as it snorts contentedly. The camera captures the pig's playful splashes, sending ripples through the water under the midday sun. Wooden fences and a red barn stand in the background, framed by rolling green hills. The pig's muddy coat glistens in the sunlight, showcasing the simple pleasures of its carefree life.",assets/texts/i2v.png
diff --git a/assets/texts/i2v.png b/assets/texts/i2v.png
new file mode 100644
index 0000000..a6439c3
Binary files /dev/null and b/assets/texts/i2v.png differ
diff --git a/assets/texts/sora.csv b/assets/texts/sora.csv
new file mode 100644
index 0000000..e6dc820
--- /dev/null
+++ b/assets/texts/sora.csv
@@ -0,0 +1,49 @@
+text
+"A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."
+"Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field."
+"A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors."
+"Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway."
+"Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image."
+"A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures."
+"This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird's head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird's striking appearance."
+Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
+"A young man at his 20s is sitting on a piece of cloud in the sky, reading a book."
+Historical footage of California during the gold rush.
+A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+"Extreme close up of a 24 year old woman's eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic"
+A cartoon kangaroo disco dances.
+"A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera."
+A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
+"The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery."
+"3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest."
+"The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it's tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds."
+Reflections in the window of a train traveling through the Tokyo suburbs.
+"A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography."
+"A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect."
+"A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds."
+"A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer."
+Borneo wildlife on the Kinabatangan River
+A Chinese Lunar New Year celebration video with Chinese Dragon.
+Tour of an art gallery with many beautiful works of art in different styles.
+"Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes."
+A stop motion animation of a flower growing out of the windowsill of a suburban house.
+The story of a robot's life in a cyberpunk setting.
+"An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film."
+"A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack."
+"New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York."
+"A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in."
+"Step-printing scene of a person running, cinematic film shot in 35mm."
+"Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing."
+Basketball through hoop then explodes.
+"Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care."
+"A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood."
+The camera directly faces colorful buildings in Burano Italy. An adorable dalmation looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
+"An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style."
+"This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal's striking appearance."
+A corgi vlogging itself in tropical Maui.
+"A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat's orange fur. The shot is clear and sharp, with a shallow depth of field."
+"Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere."
+"Tiltshift of a construction site filled with workers, equipment, and heavy machinery."
+"A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth."
+A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur.
+"The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot."
diff --git a/configs/diffusion/inference/256px.py b/configs/diffusion/inference/256px.py
new file mode 100644
index 0000000..8dc61b0
--- /dev/null
+++ b/configs/diffusion/inference/256px.py
@@ -0,0 +1,76 @@
+save_dir = "samples"  # save directory
+seed = 42  # random seed (except seed for z)
+batch_size = 1
+dtype = "bf16"
+
+cond_type = "t2v"
+# conditional inference options:
+# t2v: text-to-video
+# i2v_head: image-to-video (head)
+# i2v_tail: image-to-video (tail)
+# i2v_loop: connect images
+# v2v_head_half: video extension with first half
+# v2v_tail_half: video extension with second half
+
+dataset = dict(type="text")
+sampling_option = dict(
+    resolution="256px",  # 256px or 768px
+    aspect_ratio="16:9",  # 9:16 or 16:9 or 1:1
+    num_frames=129,  # number of frames
+    num_steps=50,  # number of steps
+    shift=True,
+    temporal_reduction=4,
+    is_causal_vae=True,
+    guidance=7.5,  # guidance for text-to-video
+    guidance_img=3.0,  # guidance for image-to-video
+    text_osci=True,  # enable text guidance oscillation
+    image_osci=True,  # enable image guidance oscillation
+    scale_temporal_osci=True,
+    method="i2v",  # hard-coded for now
+    seed=None,  # random seed for z
+)
+motion_score = "4"  # motion score for video generation
+fps_save = 24  # fps for video generation and saving
+
+# Define model components
+model = dict(
+    type="flux",
+    from_pretrained="./ckpts/Open_Sora_v2.safetensors",
+    guidance_embed=False,
+    fused_qkv=False,
+    use_liger_rope=True,
+    # model architecture
+    in_channels=64,
+    vec_in_dim=768,
+    context_in_dim=4096,
+    hidden_size=3072,
+    mlp_ratio=4.0,
+    num_heads=24,
+    depth=19,
+    depth_single_blocks=38,
+    axes_dim=[16, 56, 56],
+    theta=10_000,
+    qkv_bias=True,
+    cond_embed=True,
+)
+ae = dict(
+    type="hunyuan_vae",
+    from_pretrained="./ckpts/hunyuan_vae.safetensors",
+    in_channels=3,
+    out_channels=3,
+    layers_per_block=2,
+    latent_channels=16,
+    use_spatial_tiling=True,
+    use_temporal_tiling=False,
+)
+t5 = dict(
+    type="text_embedder",
+    from_pretrained="./ckpts/google/t5-v1_1-xxl",
+    max_length=512,
+    shardformer=True,
+)
+clip = dict(
+    type="text_embedder",
+    from_pretrained="./ckpts/openai/clip-vit-large-patch14",
+    max_length=77,
+)
diff --git a/configs/diffusion/inference/256px_tp.py b/configs/diffusion/inference/256px_tp.py
new file mode 100644
index 0000000..ac62d27
--- /dev/null
+++ b/configs/diffusion/inference/256px_tp.py
@@ -0,0 +1,4 @@
+_base_ = [  # inheritance syntax from mmengine
+    "256px.py",
+    "plugins/tp.py",  # use tensor parallel
+]
diff --git a/configs/diffusion/inference/768px.py b/configs/diffusion/inference/768px.py
new file mode 100644
index 0000000..64e24ad
--- /dev/null
+++ b/configs/diffusion/inference/768px.py
@@ -0,0 +1,8 @@
+_base_ = [  # inheritance syntax from mmengine
+    "256px.py",
+    "plugins/sp.py",  # use sequence parallel
+]
+
+sampling_option = dict(
+    resolution="768px",
+)
diff --git a/configs/diffusion/inference/high_compression.py b/configs/diffusion/inference/high_compression.py
new file mode 100644
index 0000000..72923e0
--- /dev/null
+++ b/configs/diffusion/inference/high_compression.py
@@ -0,0 +1,35 @@
+_base_ = ["t2i2v_768px.py"]
+
+# no need for parallelism
+plugin = None
+plugin_config = None
+plugin_ae = None
+plugin_config_ae = None
+
+# model settings
+patch_size = 1
+model = dict(
+    from_pretrained="./ckpts/Open_Sora_v2_Video_DC_AE.safetensors",
+    in_channels=128,
+    cond_embed=True,
+    patch_size=1,
+)
+
+# AE settings
+ae = dict(
+    _delete_=True,
+    type="dc_ae",
+    from_scratch=True,
+    model_name="dc-ae-f32t4c128",
+    from_pretrained="./ckpts/F32T4C128_AE.safetensors",
+    use_spatial_tiling=True,
+    use_temporal_tiling=True,
+    spatial_tile_size=256,
+    temporal_tile_size=32,
+    tile_overlap_factor=0.25,
+)
+ae_spatial_compression = 32
+
+sampling_option = dict(
+    num_frames=128,
+)
diff --git a/configs/diffusion/inference/plugins/sp.py b/configs/diffusion/inference/plugins/sp.py
new file mode 100644
index 0000000..f1d3977
--- /dev/null
+++ b/configs/diffusion/inference/plugins/sp.py
@@ -0,0 +1,20 @@
+plugin = "hybrid"
+plugin_config = dict(
+    tp_size=1,
+    pp_size=1,
+    sp_size=8,
+    sequence_parallelism_mode="ring_attn",
+    enable_sequence_parallelism=True,
+    static_graph=True,
+    zero_stage=2,
+    overlap_allgather=False,
+)
+
+plugin_ae = "hybrid"
+plugin_config_ae = dict(
+    tp_size=8,
+    pp_size=1,
+    sp_size=1,
+    zero_stage=2,
+    overlap_allgather=False,
+)
diff --git a/configs/diffusion/inference/plugins/t2i2v.py b/configs/diffusion/inference/plugins/t2i2v.py
new file mode 100644
index 0000000..37dab6d
--- /dev/null
+++ b/configs/diffusion/inference/plugins/t2i2v.py
@@ -0,0 +1,36 @@
+use_t2i2v = True
+
+# flux configurations
+img_flux = dict(
+    type="flux",
+    from_pretrained="./ckpts/flux1-dev.safetensors",
+    guidance_embed=True,
+    # model architecture
+    in_channels=64,
+    vec_in_dim=768,
+    context_in_dim=4096,
+    hidden_size=3072,
+    mlp_ratio=4.0,
+    num_heads=24,
+    depth=19,
+    depth_single_blocks=38,
+    axes_dim=[16, 56, 56],
+    theta=10_000,
+    qkv_bias=True,
+    cond_embed=False,  # pass i2v & v2v info, for t2v need this layer too but with x_cond and mask all set to 0
+)
+
+img_flux_ae = dict(
+    type="autoencoder_2d",
+    from_pretrained="./ckpts/flux1-dev-ae.safetensors",
+    resolution=256,
+    in_channels=3,
+    ch=128,
+    out_ch=3,
+    ch_mult=[1, 2, 4, 4],
+    num_res_blocks=2,
+    z_channels=16,
+    scale_factor=0.3611,
+    shift_factor=0.1159,
+)
+img_resolution = "768px"
diff --git a/configs/diffusion/inference/plugins/tp.py b/configs/diffusion/inference/plugins/tp.py
new file mode 100644
index 0000000..e5a89cd
--- /dev/null
+++ b/configs/diffusion/inference/plugins/tp.py
@@ -0,0 +1,17 @@
+plugin = "hybrid"
+plugin_config = dict(
+    tp_size=8,
+    pp_size=1,
+    sp_size=1,
+    zero_stage=2,
+    overlap_allgather=False,
+)
+
+plugin_ae = "hybrid"
+plugin_config_ae = dict(
+    tp_size=8,
+    pp_size=1,
+    sp_size=1,
+    zero_stage=2,
+    overlap_allgather=False,
+)
diff --git a/configs/diffusion/inference/t2i2v_256px.py b/configs/diffusion/inference/t2i2v_256px.py
new file mode 100644
index 0000000..9e2106b
--- /dev/null
+++ b/configs/diffusion/inference/t2i2v_256px.py
@@ -0,0 +1,4 @@
+_base_ = [  # inheritance syntax from mmengine
+    "256px.py",
+    "plugins/t2i2v.py",
+]
diff --git a/configs/diffusion/inference/t2i2v_768px.py b/configs/diffusion/inference/t2i2v_768px.py
new file mode 100644
index 0000000..933dd49
--- /dev/null
+++ b/configs/diffusion/inference/t2i2v_768px.py
@@ -0,0 +1,4 @@
+_base_ = [  # inheritance syntax from mmengine
+    "768px.py",
+    "plugins/t2i2v.py",
+]
diff --git a/configs/diffusion/train/demo.py b/configs/diffusion/train/demo.py
new file mode 100644
index 0000000..cef92f5
--- /dev/null
+++ b/configs/diffusion/train/demo.py
@@ -0,0 +1,12 @@
+_base_ = ["stage1.py"]
+
+
+bucket_config = {
+    "_delete_": True,
+    "256px": {
+        1: (1.0, 1),
+        33: (1.0, 1),
+        97: (1.0, 1),
+        129: (1.0, 1),
+    },
+}
diff --git a/configs/diffusion/train/high_compression.py b/configs/diffusion/train/high_compression.py
new file mode 100644
index 0000000..d42b727
--- /dev/null
+++ b/configs/diffusion/train/high_compression.py
@@ -0,0 +1,71 @@
+_base_ = ["image.py"]
+
+bucket_config = {
+    "_delete_": True,
+    "768px": {
+        1: (1.0, 20),
+        16: (1.0, 8),
+        20: (1.0, 8),
+        24: (1.0, 8),
+        28: (1.0, 8),
+        32: (1.0, 8),
+        36: (1.0, 4),
+        40: (1.0, 4),
+        44: (1.0, 4),
+        48: (1.0, 4),
+        52: (1.0, 4),
+        56: (1.0, 4),
+        60: (1.0, 4),
+        64: (1.0, 4),
+        68: (1.0, 3),
+        72: (1.0, 3),
+        76: (1.0, 3),
+        80: (1.0, 3),
+        84: (1.0, 3),
+        88: (1.0, 3),
+        92: (1.0, 3),
+        96: (1.0, 3),
+        100: (1.0, 2),
+        104: (1.0, 2),
+        108: (1.0, 2),
+        112: (1.0, 2),
+        116: (1.0, 2),
+        120: (1.0, 2),
+        124: (1.0, 2),
+        128: (1.0, 2),  # 30s
+    },
+}
+
+condition_config = dict(
+    t2v=1,
+    i2v_head=7,
+)
+
+grad_ckpt_settings = (100, 100)
+patch_size = 1
+model = dict(
+    from_pretrained=None,
+    grad_ckpt_settings=grad_ckpt_settings,
+    in_channels=128,
+    cond_embed=True,
+    patch_size=patch_size,
+)
+ae = dict(
+    _delete_=True,
+    type="dc_ae",
+    model_name="dc-ae-f32t4c128",
+    from_pretrained="./ckpts/F32T4C128_AE.safetensors",
+    from_scratch=True,
+    scaling_factor=0.493,
+    use_spatial_tiling=True,
+    use_temporal_tiling=True,
+    spatial_tile_size=256,
+    temporal_tile_size=32,
+    tile_overlap_factor=0.25,
+)
+is_causal_vae = False
+ae_spatial_compression = 32
+
+ckpt_every = 250
+lr = 3e-5
+optim = dict(lr=lr)
diff --git a/configs/diffusion/train/image.py b/configs/diffusion/train/image.py
new file mode 100644
index 0000000..0d64a07
--- /dev/null
+++ b/configs/diffusion/train/image.py
@@ -0,0 +1,114 @@
+# Dataset settings
+dataset = dict(
+    type="video_text",
+    transform_name="resize_crop",
+    fps_max=24,  # the desired fps for training
+    vmaf=True,  # load vmaf scores into text
+)
+
+grad_ckpt_settings = (8, 100)  # set the grad checkpoint settings
+bucket_config = {
+    "256px": {1: (1.0, 50)},
+    "768px": {1: (0.5, 11)},
+    "1024px": {1: (0.5, 7)},
+}
+
+# Define model components
+model = dict(
+    type="flux",
+    from_pretrained=None,
+    strict_load=False,
+    guidance_embed=False,
+    fused_qkv=False,
+    use_liger_rope=True,
+    grad_ckpt_settings=grad_ckpt_settings,
+    # model architecture
+    in_channels=64,
+    vec_in_dim=768,
+    context_in_dim=4096,
+    hidden_size=3072,
+    mlp_ratio=4.0,
+    num_heads=24,
+    depth=19,
+    depth_single_blocks=38,
+    axes_dim=[16, 56, 56],
+    theta=10_000,
+    qkv_bias=True,
+)
+dropout_ratio = {  # probability for dropout text embedding
+    "t5": 0.31622777,
+    "clip": 0.31622777,
+}
+ae = dict(
+    type="hunyuan_vae",
+    from_pretrained="./ckpts/hunyuan_vae.safetensors",
+    in_channels=3,
+    out_channels=3,
+    layers_per_block=2,
+    latent_channels=16,
+    use_spatial_tiling=True,
+    use_temporal_tiling=False,
+)
+is_causal_vae = True
+t5 = dict(
+    type="text_embedder",
+    from_pretrained="google/t5-v1_1-xxl",
+    cache_dir="/mnt/ddn/sora/tmp_load/huggingface/hub/",
+    max_length=512,
+    shardformer=True,
+)
+clip = dict(
+    type="text_embedder",
+    from_pretrained="openai/clip-vit-large-patch14",
+    cache_dir="/mnt/ddn/sora/tmp_load/huggingface/hub/",
+    max_length=77,
+)
+
+# Optimization settings
+lr = 1e-5
+eps = 1e-15
+optim = dict(
+    cls="HybridAdam",
+    lr=lr,
+    eps=eps,
+    weight_decay=0.0,
+    adamw_mode=True,
+)
+warmup_steps = 0
+update_warmup_steps = True
+
+grad_clip = 1.0
+accumulation_steps = 1
+ema_decay = None
+
+# Acceleration settings
+prefetch_factor = 2
+num_workers = 12
+num_bucket_build_workers = 64
+dtype = "bf16"
+plugin = "zero2"
+grad_checkpoint = True
+plugin_config = dict(
+    reduce_bucket_size_in_m=128,
+    overlap_allgather=False,
+)
+pin_memory_cache_pre_alloc_numels = [(260 + 20) * 1024 * 1024] * 24 + [
+    (34 + 20) * 1024 * 1024
+] * 4
+async_io = False
+
+# Other settings
+seed = 42
+outputs = "outputs"
+epochs = 1000
+log_every = 10
+ckpt_every = 100
+keep_n_latest = 20
+wandb_project = "mmdit"
+
+save_master_weights = True
+load_master_weights = True
+
+# For debugging
+# record_time = True
+# record_barrier = True
diff --git a/configs/diffusion/train/stage1.py b/configs/diffusion/train/stage1.py
new file mode 100644
index 0000000..6ee1bc8
--- /dev/null
+++ b/configs/diffusion/train/stage1.py
@@ -0,0 +1,56 @@
+_base_ = ["image.py"]
+
+dataset = dict(memory_efficient=False)
+
+# new config
+grad_ckpt_settings = (8, 100)
+bucket_config = {
+    "_delete_": True,
+    "256px": {
+        1: (1.0, 45),
+        5: (1.0, 12),
+        9: (1.0, 12),
+        13: (1.0, 12),
+        17: (1.0, 12),
+        21: (1.0, 12),
+        25: (1.0, 12),
+        29: (1.0, 12),
+        33: (1.0, 12),
+        37: (1.0, 6),
+        41: (1.0, 6),
+        45: (1.0, 6),
+        49: (1.0, 6),
+        53: (1.0, 6),
+        57: (1.0, 6),
+        61: (1.0, 6),
+        65: (1.0, 6),
+        69: (1.0, 4),
+        73: (1.0, 4),
+        77: (1.0, 4),
+        81: (1.0, 4),
+        85: (1.0, 4),
+        89: (1.0, 4),
+        93: (1.0, 4),
+        97: (1.0, 4),
+        101: (1.0, 3),
+        105: (1.0, 3),
+        109: (1.0, 3),
+        113: (1.0, 3),
+        117: (1.0, 3),
+        121: (1.0, 3),
+        125: (1.0, 3),
+        129: (1.0, 3),
+    },
+    "768px": {
+        1: (0.5, 13),
+    },
+    "1024px": {
+        1: (0.5, 7),
+    },
+}
+
+model = dict(grad_ckpt_settings=grad_ckpt_settings)
+lr = 5e-5
+optim = dict(lr=lr)
+ckpt_every = 2000
+keep_n_latest = 20
diff --git a/configs/diffusion/train/stage1_i2v.py b/configs/diffusion/train/stage1_i2v.py
new file mode 100644
index 0000000..0e0f927
--- /dev/null
+++ b/configs/diffusion/train/stage1_i2v.py
@@ -0,0 +1,14 @@
+_base_ = ["stage1.py"]
+
+# Define model components
+model = dict(cond_embed=True)
+
+condition_config = dict(
+    t2v=1,
+    i2v_head=5,  # train i2v (image as first frame) with weight 5
+    i2v_loop=1,  # train image connection with weight 1
+    i2v_tail=1,  # train i2v (image as last frame) with weight 1
+)
+
+lr = 1e-5
+optim = dict(lr=lr)
diff --git a/configs/diffusion/train/stage2.py b/configs/diffusion/train/stage2.py
new file mode 100644
index 0000000..8329456
--- /dev/null
+++ b/configs/diffusion/train/stage2.py
@@ -0,0 +1,94 @@
+_base_ = ["image.py"]
+
+# new config
+grad_ckpt_settings = (100, 100)
+
+plugin = "hybrid"
+plugin_config = dict(
+    tp_size=1,
+    pp_size=1,
+    sp_size=4,
+    sequence_parallelism_mode="ring_attn",
+    enable_sequence_parallelism=True,
+    static_graph=True,
+    zero_stage=2,
+)
+
+bucket_config = {  # outer key: resolution bucket; inner key: number of frames
+    "_delete_": True,  # presumably replaces the inherited bucket_config instead of merging — confirm with the config loader
+    "256px": {  # NOTE(review): tuple looks like (sampling probability, batch size) — confirm against the bucket sampler
+        1: (1.0, 130),
+        5: (1.0, 14),
+        9: (1.0, 14),
+        13: (1.0, 14),
+        17: (1.0, 14),
+        21: (1.0, 14),
+        25: (1.0, 14),
+        29: (1.0, 14),
+        33: (1.0, 14),
+        37: (1.0, 10),
+        41: (1.0, 10),
+        45: (1.0, 10),
+        49: (1.0, 10),
+        53: (1.0, 10),
+        57: (1.0, 10),
+        61: (1.0, 10),
+        65: (1.0, 10),
+        73: (1.0, 7),  # NOTE(review): no 69-frame entry here, unlike the 768px table below — confirm intentional
+        77: (1.0, 7),
+        81: (1.0, 7),
+        85: (1.0, 7),
+        89: (1.0, 7),
+        93: (1.0, 7),
+        97: (1.0, 7),
+        101: (1.0, 6),
+        105: (1.0, 6),
+        109: (1.0, 6),
+        113: (1.0, 6),
+        117: (1.0, 6),
+        121: (1.0, 6),
+        125: (1.0, 6),
+        129: (1.0, 6),
+    },
+    "768px": {
+        1: (1.0, 38),
+        5: (1.0, 6),
+        9: (1.0, 6),
+        13: (1.0, 6),
+        17: (1.0, 6),
+        21: (1.0, 6),
+        25: (1.0, 6),
+        29: (1.0, 6),
+        33: (1.0, 6),
+        37: (1.0, 4),
+        41: (1.0, 4),
+        45: (1.0, 4),
+        49: (1.0, 4),
+        53: (1.0, 4),
+        57: (1.0, 4),
+        61: (1.0, 4),
+        65: (1.0, 4),
+        69: (1.0, 3),
+        73: (1.0, 3),
+        77: (1.0, 3),
+        81: (1.0, 3),
+        85: (1.0, 3),
+        89: (1.0, 3),
+        93: (1.0, 3),
+        97: (1.0, 3),
+        101: (1.0, 2),
+        105: (1.0, 2),
+        109: (1.0, 2),
+        113: (1.0, 2),
+        117: (1.0, 2),
+        121: (1.0, 2),
+        125: (1.0, 2),
+        129: (1.0, 2),
+    },
+}
+
+model = dict(grad_ckpt_settings=grad_ckpt_settings)
+lr = 5e-5
+optim = dict(lr=lr)
+ckpt_every = 200
+keep_n_latest = 20
diff --git a/configs/diffusion/train/stage2_i2v.py b/configs/diffusion/train/stage2_i2v.py
new file mode 100644
index 0000000..fc9e86d
--- /dev/null
+++ b/configs/diffusion/train/stage2_i2v.py
@@ -0,0 +1,87 @@
+_base_ = ["stage2.py"]
+
+# Define model components
+model = dict(cond_embed=True)
+grad_ckpt_buffer_size = 25 * 1024**3
+
+condition_config = dict(
+    t2v=1,
+    i2v_head=5,  # train i2v (image as first frame) with weight 5
+    i2v_loop=1,  # train image connection with weight 1
+    i2v_tail=1,  # train i2v (image as last frame) with weight 1
+)
+is_causal_vae = True
+
+bucket_config = {
+    "_delete_": True,
+    "256px": {
+        1: (1.0, 195),
+        5: (1.0, 80),
+        9: (1.0, 80),
+        13: (1.0, 80),
+        17: (1.0, 80),
+        21: (1.0, 80),
+        25: (1.0, 80),
+        29: (1.0, 80),
+        33: (1.0, 80),
+        37: (1.0, 40),
+        41: (1.0, 40),
+        45: (1.0, 40),
+        49: (1.0, 40),
+        53: (1.0, 40),
+        57: (1.0, 40),
+        61: (1.0, 40),
+        65: (1.0, 40),
+        69: (1.0, 28),
+        73: (1.0, 28),
+        77: (1.0, 28),
+        81: (1.0, 28),
+        85: (1.0, 28),
+        89: (1.0, 28),
+        93: (1.0, 28),
+        97: (1.0, 28),
+        101: (1.0, 23),
+        105: (1.0, 23),
+        109: (1.0, 23),
+        113: (1.0, 23),
+        117: (1.0, 23),
+        121: (1.0, 23),
+        125: (1.0, 23),
+        129: (1.0, 23),
+    },
+    "768px": {
+        1: (0.5, 38),
+        5: (0.5, 10),
+        9: (0.5, 10),
+        13: (0.5, 10),
+        17: (0.5, 10),
+        21: (0.5, 10),
+        25: (0.5, 10),
+        29: (0.5, 10),
+        33: (0.5, 10),
+        37: (0.5, 5),
+        41: (0.5, 5),
+        45: (0.5, 5),
+        49: (0.5, 5),
+        53: (0.5, 5),
+        57: (0.5, 5),
+        61: (0.5, 5),
+        65: (0.5, 5),
+        69: (0.5, 3),
+        73: (0.5, 3),
+        77: (0.5, 3),
+        81: (0.5, 3),
+        85: (0.5, 3),
+        89: (0.5, 3),
+        93: (0.5, 3),
+        97: (0.5, 3),
+        101: (0.5, 2),
+        105: (0.5, 2),
+        109: (0.5, 2),
+        113: (0.5, 2),
+        117: (0.5, 2),
+        121: (0.5, 2),
+        125: (0.5, 2),
+        129: (0.5, 2),
+    },
+}
diff --git a/configs/vae/inference/hunyuanvideo_vae.py b/configs/vae/inference/hunyuanvideo_vae.py
new file mode 100644
index 0000000..1f53724
--- /dev/null
+++ b/configs/vae/inference/hunyuanvideo_vae.py
@@ -0,0 +1,33 @@
+dtype = "bf16"
+batch_size = 1
+seed = 42
+save_dir = "samples/hunyuanvideo_vae"
+
+plugin = "zero2"
+dataset = dict(
+    type="video_text",
+    transform_name="resize_crop",
+    fps_max=16,
+    data_path="datasets/pexels_45k_necessary.csv",
+)
+bucket_config = {
+    "512px_ar1:1": {97: (1.0, 1)},
+}
+
+num_workers = 24
+num_bucket_build_workers = 16
+prefetch_factor = 4
+
+model = dict(
+    type="hunyuan_vae",
+    from_pretrained="./ckpts/hunyuan_vae.safetensors",
+    in_channels=3,
+    out_channels=3,
+    layers_per_block=2,
+    latent_channels=16,
+    scale_factor=0.476986,
+    shift_factor=0,
+    use_spatial_tiling=True,
+    use_temporal_tiling=True,
+    time_compression_ratio=4,
+)
diff --git a/configs/vae/inference/video_dc_ae.py b/configs/vae/inference/video_dc_ae.py
new file mode 100644
index 0000000..4a4055f
--- /dev/null
+++ b/configs/vae/inference/video_dc_ae.py
@@ -0,0 +1,32 @@
+dtype = "bf16"
+batch_size = 1
+seed = 42
+
+dataset = dict(
+    type="video_text",
+    transform_name="resize_crop",
+    fps_max=16,
+    data_path="datasets/pexels_45k_necessary.csv",
+)
+bucket_config = {
+    "512px_ar1:1": {96: (1.0, 1)},
+}
+
+model = dict(
+    type="dc_ae",
+    model_name="dc-ae-f32t4c128",
+    from_pretrained="./ckpts/F32T4C128_AE.safetensors",
+    from_scratch=True,
+    use_spatial_tiling=True,
+    use_temporal_tiling=True,
+    spatial_tile_size=256,
+    temporal_tile_size=32,
+    tile_overlap_factor=0.25,
+)
+
+save_dir = "samples/video_dc_ae"
+
+num_workers = 24
+num_bucket_build_workers = 16
+prefetch_factor = 4
+
diff --git a/configs/vae/train/video_dc_ae.py b/configs/vae/train/video_dc_ae.py
new file mode 100644
index 0000000..d8eac1c
--- /dev/null
+++ b/configs/vae/train/video_dc_ae.py
@@ -0,0 +1,74 @@
+# ============
+# model config 
+# ============
+model = dict(
+    type="dc_ae",
+    model_name="dc-ae-f32t4c128",
+    from_scratch=True,
+    from_pretrained=None,
+)
+
+# ============
+# data config 
+# ============
+dataset = dict(
+    type="video_text",
+    transform_name="resize_crop",
+    data_path="datasets/pexels_45k_necessary.csv",
+    fps_max=24,
+)
+
+bucket_config = {
+    "256px_ar1:1": {32: (1.0, 1)},
+}
+
+num_bucket_build_workers = 64
+num_workers = 12
+prefetch_factor = 2
+
+# ============
+# train config 
+# ============
+optim = dict(
+    cls="HybridAdam",
+    lr=5e-5,
+    eps=1e-8,
+    weight_decay=0.0,
+    adamw_mode=True,
+    betas=(0.9, 0.98),
+)
+lr_scheduler = dict(warmup_steps=0)
+
+mixed_strategy = "mixed_video_image"
+mixed_image_ratio = 0.2  # 1:4
+
+dtype = "bf16"
+plugin = "zero2"
+plugin_config = dict(
+    reduce_bucket_size_in_m=128,
+    overlap_allgather=False,
+)
+
+grad_clip = 1.0
+grad_checkpoint = False
+pin_memory_cache_pre_alloc_numels = [50 * 1024 * 1024] * num_workers * prefetch_factor
+
+seed = 42
+outputs = "outputs"
+epochs = 100
+log_every = 10
+ckpt_every = 3000
+keep_n_latest = 50
+ema_decay = 0.99
+wandb_project = "dcae"
+
+update_warmup_steps = True
+
+# ============
+# loss config 
+# ============
+vae_loss_config = dict(
+    perceptual_loss_weight=0.5,
+    kl_loss_weight=0,
+)
+
diff --git a/configs/vae/train/video_dc_ae_disc.py b/configs/vae/train/video_dc_ae_disc.py
new file mode 100644
index 0000000..c370ec3
--- /dev/null
+++ b/configs/vae/train/video_dc_ae_disc.py
@@ -0,0 +1,34 @@
+_base_ = ["video_dc_ae.py"]
+
+discriminator = dict(
+    type="N_Layer_discriminator_3D",
+    from_pretrained=None,
+    input_nc=3,
+    n_layers=5,
+    conv_cls="conv3d"
+)
+disc_lr_scheduler = dict(warmup_steps=0)
+
+gen_loss_config = dict(
+    gen_start=0,
+    disc_weight=0.05,
+)
+
+disc_loss_config = dict(
+    disc_start=0,
+    disc_loss_type="hinge",
+)
+
+optim_discriminator = dict(
+    cls="HybridAdam",
+    lr=1e-4,
+    eps=1e-8,
+    weight_decay=0.0,
+    adamw_mode=True,
+    betas=(0.9, 0.98),
+)
+
+grad_checkpoint = True
+model = dict(
+    disc_off_grad_ckpt = True, # set to true if your `grad_checkpoint` is True
+)
diff --git a/docs/ae.md b/docs/ae.md
new file mode 100644
index 0000000..84e2ad6
--- /dev/null
+++ b/docs/ae.md
@@ -0,0 +1,154 @@
+# Step by step to train and evaluate a video autoencoder (AE)
+Inspired by [SANA](https://arxiv.org/abs/2410.10629), we aim to drastically increase the compression ratio in the AE. We propose a video autoencoder architecture based on [DC-AE](https://github.com/mit-han-lab/efficientvit), the __Video DC-AE__, which compresses the video by 4x in the temporal dimension and 32x32 in the spatial dimension. Compared to [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)'s VAE of 4x8x8, our proposed AE has a much higher spatial compression ratio.
+Thus, we can effectively reduce the token length in the diffusion model by a total of 16x (assuming the same patch sizes), drastically increasing both training and inference speed.
+
+## Data Preparation
+
+Follow this [guide](./train.md#prepare-dataset) to prepare the __DATASET__ for training and inference. You may use our provided dataset or custom ones.
+
+To use custom dataset, pass the argument `--dataset.data_path <your_data_path>` to the following training or inference command.
+
+## Training
+
+We train our __Video DC-AE__ from scratch on 8xGPUs for 3 weeks.
+
+We first train with the following command:
+
+```bash
+torchrun --nproc_per_node 8 scripts/vae/train.py configs/vae/train/video_dc_ae.py
+```
+
+When the model is almost converged, we add a discriminator and continue to train the model with the checkpoint `model_ckpt` using the following command:
+
+```bash
+torchrun --nproc_per_node 8 scripts/vae/train.py configs/vae/train/video_dc_ae_disc.py --model.from_pretrained <model_ckpt>
+```
+You may pass the flag `--wandb True` if you have a [wandb](https://wandb.ai/home) account and wish to track the training progress online.
+
+## Inference
+
+Download the relevant weights following [this guide](../README.md#model-download). Alternatively, you may use your own trained model by passing the following flag `--model.from_pretrained <your_model_ckpt_path>`.
+
+### Video DC-AE
+
+Use the following code to reconstruct the videos using our trained `Video DC-AE`:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/vae/inference.py configs/vae/inference/video_dc_ae.py --save-dir samples/dcae
+```
+
+### Hunyuan Video
+
+Alternatively, we have incorporated [HunyuanVideo vae](https://github.com/Tencent/HunyuanVideo) into our code, you may run inference with the following command:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/vae/inference.py configs/vae/inference/hunyuanvideo_vae.py --save-dir samples/hunyuanvideo_vae
+```
+
+## Config Interpretation
+
+All AE configs are located in `configs/vae/`, divided into configs for training (`configs/vae/train`) and for inference (`configs/vae/inference`).
+
+### Training Config
+
+For training, the same config rules as [those](./train.md#config) for the diffusion model are applied.
+
+<details>
+<summary> <b>Loss Config</b> </summary>
+Our __Video DC-AE__ is based on the [DC-AE](https://github.com/mit-han-lab/efficientvit) architecture, which doesn't have a variational component. Thus, our training loss simply consists of the *reconstruction loss* and the *perceptual loss*.
+Experimentally, we found that setting a ratio of 0.5 for the perceptual loss is effective.
+
+```python
+vae_loss_config = dict(
+    perceptual_loss_weight=0.5, # weigh the perceptual loss by 0.5
+    kl_loss_weight=0,           # no KL loss
+)
+```
+
+In a later stage, we include a discriminator, and the training loss for the ae has an additional generator loss component, where we use a small ratio of 0.05 to weigh the loss calculated:
+```python
+gen_loss_config = dict(
+    gen_start=0,                # include generator loss from step 0 onwards          
+    disc_weight=0.05,           # weigh the loss by 0.05
+)
+```
+
+The discriminator we use is trained from scratch, and its loss is simply the hinge loss:
+```python
+disc_loss_config = dict(
+    disc_start=0,               # update the discriminator from step 0 onwards
+    disc_loss_type="hinge",     # the discriminator loss type
+)
+```
+</details>
+
+<details>
+<summary> <b> Data Bucket Config </b> </summary>
+For the data bucket, we used 32 frames of 256px videos to train our AE.
+```python
+bucket_config = {
+    "256px_ar1:1": {32: (1.0, 1)},
+}
+```
+</details>
+
+<details>
+<summary> <b>Train with more frames or higher resolutions</b> </summary>
+
+If you train with more frames or larger resolutions, you may increase the `spatial_tile_size` and `temporal_tile_size` during inference without degrading the AE performance (see [Inference Config](ae.md#inference-config)). This may give you the advantage of faster AE inference, for example when training the diffusion model (although at the cost of slower AE training).
+
+You may increase the number of video frames to 96 (although any multiple of 4 works, we generally recommend frame counts that are multiples of 32):
+
+```python
+bucket_config = {
+    "256px_ar1:1": {96: (1.0, 1)},
+}
+grad_checkpoint = True
+```
+or train for higher resolution such as 512px:
+```python
+bucket_config = {
+    "512px_ar1:1": {32: (1.0, 1)},
+}
+grad_checkpoint = True
+```
+Note that gradient checkpointing needs to be turned on in order to prevent OOM errors.
+
+Moreover, if `grad_checkpoint` is set to `True` in discriminator training, you need to pass the flag `--model.disc_off_grad_ckpt True` or simply set it in the config:
+```python
+grad_checkpoint = True
+model = dict(
+    disc_off_grad_ckpt = True, # set to true if your `grad_checkpoint` is True
+)
+```
+This is to make sure the discriminator loss will have a gradient at the last layer during adaptive loss calculation.
+</details>
+
+
+
+
+### Inference Config
+
+For AE inference, we have replicated the tiling mechanism in hunyuan to our Video DC-AE, which can be turned on with the following:
+
+```python
+model = dict(
+    ...,
+    use_spatial_tiling=True,
+    use_temporal_tiling=True,
+    spatial_tile_size=256,
+    temporal_tile_size=32,
+    tile_overlap_factor=0.25,
+    ...,
+)
+```
+
+By default, both spatial tiling and temporal tiling are turned on for the best performance.
+Since our Video DC-AE is trained on 256px videos of 32 frames only, `spatial_tile_size` should be set to 256 and `temporal_tile_size` should be set to 32.
+If you train your own Video DC-AE with other resolutions and length, you may adjust the values accordingly.
+
+You can specify the directory to store output samples with `--save_dir <your_dir>` or setting it in config, for instance:
+
+```python
+save_dir = "./samples"
+```
diff --git a/docs/hcae.md b/docs/hcae.md
new file mode 100644
index 0000000..1ea6cb5
--- /dev/null
+++ b/docs/hcae.md
@@ -0,0 +1,38 @@
+# 10× inference speedup with high-compression autoencoder
+
+
+The high computational cost of training video generation models arises from the
+large number of tokens and the dominance of attention computation. To further reduce training expenses,
+we explore training video generation models with high-compression autoencoders (Video DC-AEs). As shown in the comparison below, by switching to the Video DC-AE with a much higher downsample ratio (4 x 32 x 32), we can afford to further reduce the patch size to 1 and still achieve __5.2× speedup in training throughput__ and __10x speedup during inference__:
+
+![opensorav2_speed](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/hcae_opensorav2_speed.png)
+
+
+Nevertheless, despite the advantage of drastically lower computation costs, other challenges remain. For instance, a larger number of channels slows down convergence. Our generation model adapted with a 128-channel Video DC-AE for 25K iterations achieves a loss level of 0.5, as compared to 0.1 from the initialization model. While the fast video generation model underperforms the original, it still captures spatial-temporal
+relationships. We release this model to the research community for further exploration.
+
+Check out more details in our [report](https://arxiv.org/abs/2503.09642v1).
+
+## Model Download
+
+Download from 🤗 [Huggingface](https://huggingface.co/hpcai-tech/Open-Sora-v2-Video-DC-AE):
+
+```bash
+pip install "huggingface_hub[cli]"
+huggingface-cli download hpcai-tech/Open-Sora-v2-Video-DC-AE --local-dir ./ckpts
+```
+
+## Inference
+
+To run inference with our fast video generation model:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/high_compression.py --prompt "The story of a robot's life in a cyberpunk setting." 
+```
+
+## Training
+Follow this [guide](./train.md#prepare-dataset) to prepare the __DATASET__ for training.
+Then, you may train your own fast generation model with the following command:
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/high_compression.py --dataset.data-path datasets/pexels_45k_necessary.csv
+```
diff --git a/docs/report_01.md b/docs/report_01.md
new file mode 100644
index 0000000..af9af36
--- /dev/null
+++ b/docs/report_01.md
@@ -0,0 +1,49 @@
+# Open-Sora 1.0 Report
+
+OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no information about its details. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model.
+
+## Efficiency in choosing the architecture
+
+To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.
+
+The video training involves a large amount of tokens. Considering 24fps 1min videos, we have 1440 frames. With VAE downsampling 4x and patch size downsampling 2x, we have 1440x1024≈1.5M tokens. Full attention on 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost following [Latte](https://github.com/Vchitect/Latte).
+
+As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control a similar number of parameters for these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that with same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) out of efficiency. Speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).
+
+![Architecture Comparison](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch_comp.png)
+
+To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves the model's image generation ability at the beginning, while Latte's architecture cannot. The inserted attention increases the number of parameters from 580M to 724M.
+
+![Architecture](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch.jpg)
+
+Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost.
+
+We also try to use a 3D patch embedder in DiT. However, with 2x downsampling on temporal dimension, the generated videos have a low quality. Thus, we leave the downsampling to temporal VAE in our next version. For now, we sample at every 3 frames with 16 frames training and every 2 frames with 64 frames training.
+
+## Data is the key to high quality
+
+We find that the number and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collect 20k relatively high quality videos from [Pexels](https://www.pexels.com/), which provides free license videos. We label the video with LLaVA, an image captioning model, with three frames and a designed prompt. With designed prompt, LLaVA can generate good quality of captions.
+
+![Caption](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_caption.png)
+
+As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version.
+
+## Training Details
+
+With a limited training budget, we made only a few explorations. We find that a learning rate of 1e-4 is too large and scale it down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16` and may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.
+
+## Loss curves
+
+16x256x256 Pretraining Loss Curve
+
+![16x256x256 Pretraining Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_1.png)
+
+16x256x256 HQ Training Loss Curve
+
+![16x256x256 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_2.png)
+
+16x512x512 HQ Training Loss Curve
+
+![16x512x512 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_3.png)
+
+> Core Contributor: Zangwei Zheng*, Xiangyu Peng*, Shenggui Li, Hongxing Liu, Yang You
diff --git a/docs/report_02.md b/docs/report_02.md
new file mode 100644
index 0000000..b04cf4a
--- /dev/null
+++ b/docs/report_02.md
@@ -0,0 +1,117 @@
+# Open-Sora 1.1 Report
+
+- [Model Architecture Modification](#model-architecture-modification)
+- [Support for Multi-time/resolution/aspect ratio/fps Training](#support-for-multi-timeresolutionaspect-ratiofps-training)
+- [Masked DiT as Image/Video-to-Video Model](#masked-dit-as-imagevideo-to-video-model)
+- [Data Collection \& Pipeline](#data-collection--pipeline)
+- [Training Details](#training-details)
+- [Limitation and Future Work](#limitation-and-future-work)
+
+In the Open-Sora 1.1 release, we train a 700M model on 10M data (Open-Sora 1.0 trained on 400K data) with a better STDiT architecture. We implement the following features mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators):
+
+- Variable durations, resolutions, aspect ratios (Sampling flexibility, Improved framing and composition)
+- Prompting with images and videos (Animating images, Extending generated videos, Video-to-video editing, Connecting videos)
+- Image generation capabilities
+
+To achieve this goal, we use multi-task learning in the pretraining stage. For diffusion models, training with different sampled timesteps is already a form of multi-task learning. We further extend this idea to multi-resolution, aspect ratio, frame length, fps, and different mask strategies for image and video conditioned generation. We train the model on **0s~15s, 144p to 720p, various aspect ratios** videos. Although the quality of time consistency is not that high due to limited training FLOPs, we can still see the potential of the model.
+
+## Model Architecture Modification
+
+We made the following modifications to the original ST-DiT for better training stability and performance (ST-DiT-2):
+
+- **[Rope embedding](https://arxiv.org/abs/2104.09864) for temporal attention**: Following LLM's best practice, we change the sinusoidal positional encoding to rope embedding for temporal attention since it is also a sequence prediction task.
+- **AdaIN and Layernorm for temporal attention**: we wrap the temporal attention with AdaIN and layernorm as the spatial attention to stabilize the training.
+- **[QK-normalization](https://arxiv.org/abs/2302.05442) with [RMSNorm](https://arxiv.org/abs/1910.07467)**: Following [SD3](https://arxiv.org/pdf/2403.03206.pdf), we apply QK-normalization to the all attention for better training stability in half-precision.
+- **Dynamic input size support and video information condition**: To support multi-resolution, aspect ratio, and fps training, we make ST-DiT-2 accept any input size and automatically scale positional embeddings. Extending [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)'s idea, we condition on the video's height, width, aspect ratio, frame length, and fps.
+- **Extending T5 tokens from 120 to 200**: our caption is usually less than 200 tokens, and we find the model can handle longer text well.
+
+## Support for Multi-time/resolution/aspect ratio/fps Training
+
+As mentioned in [Sora's report](https://openai.com/research/video-generation-models-as-world-simulators), training with the original video's resolution, aspect ratio, and length increases sampling flexibility and improves framing and composition. We found three ways to achieve this goal:
+
+- [NaViT](https://arxiv.org/abs/2307.06304): support dynamic size within the same batch by masking, with little efficiency loss. However, the system is a bit complex to implement, and may not benefit from optimized kernels such as flash attention.
+- Padding ([FiT](https://arxiv.org/abs/2402.12376), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)): support dynamic size within the same batch by padding. However, padding different resolutions to the same size is not efficient.
+- Bucket ([SDXL](https://arxiv.org/abs/2307.01952), [PixArt](https://arxiv.org/abs/2310.00426)): support dynamic size in different batches by bucketing, but the size must be the same within the same batch, and only a fixed number of size can be applied. With the same size in a batch, we do not need to implement complex masking or padding.
+
+For the simplicity of implementation, we choose the bucket method. We pre-define some fixed resolution, and allocate different samples to different bucket. The concern for bucketing is listed below. But we can see that the concern is not a big issue in our case.
+
+<details>
+<summary>View the concerns</summary>
+
+- The bucket size is limited to a fixed number: First, in real-world applications, only a few aspect ratios (9:16, 3:4) and resolutions (240p, 1080p) are commonly used. Second, we find trained models can generalize well to unseen resolutions.
+- The size in each batch is the same, breaks the i.i.d. assumption: Since we are using multiple GPUs, the local batches on different GPUs have different sizes. We did not see a significant performance drop due to this issue.
+- There may not be enough samples to fill each bucket and the distribution may be biased: First, our dataset is large enough to fill each bucket when the local batch size is not too large. Second, we should analyze the data's distribution on sizes and define the bucket size accordingly. Third, an unbalanced distribution did not affect the training process significantly.
+- Different resolutions and frame lengths may have different processing speed: Different from PixArt, which only deals with aspect ratios of similar resolutions (similar token numbers), we need to consider the processing speed of different resolutions and frame lengths. We can use the `bucket_config` to define the batch size for each bucket to ensure the processing speed is similar.
+
+</details>
+
+![bucket](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_bucket.png)
+
+As shown in the figure, a bucket is a triplet of `(resolution, num_frame, aspect_ratio)`. We provide pre-defined aspect ratios for different resolution that covers most of the common video aspect ratios. Before each epoch, we shuffle the dataset and allocate the samples to different buckets as shown in the figure. We put a sample into a bucket with largest resolution and frame length that is smaller than the video's.
+
+Considering our computational resource is limited, we further introduce two attributes `keep_prob` and `batch_size` for each `(resolution, num_frame)` to reduce the computational cost and enable multi-stage training. Specifically, a high-resolution video will be downsampled to a lower resolution with probability `1-keep_prob` and the batch size for each bucket is `batch_size`. In this way, we can control the number of samples in different buckets and balance the GPU load by searching for a good batch size for each bucket.
+
+A detailed explanation of the bucket usage in training is available in [docs/config.md](/docs/config.md#training-bucket-configs).
+
+## Masked DiT as Image/Video-to-Video Model
+
+Transformers can be easily extended to support image-to-image and video-to-video tasks. We propose a mask strategy to support image and video conditioning. The mask strategy is shown in the figure below.
+
+![mask strategy](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask.png)
+
+Typically, we unmask the frames to be conditioned on for image/video-to-video condition. During the ST-DiT forward, unmasked frames will have timestep 0, while others remain the same (t). We find that directly applying the strategy to a trained model yields poor results, as the diffusion model did not learn to handle different timesteps in one sample during training.
+
+Inspired by [UL2](https://arxiv.org/abs/2205.05131), we introduce random mask strategy during training. Specifically, we randomly unmask the frames during training, including unmask the first frame, the first k frames, the last frame, the last k frames, the first and last k frames, random frames, etc. Based on Open-Sora 1.0, with 50% probability of applying masking, we see the model can learn to handle image conditioning (while 30% yields worse ability) for 10k steps, with a little text-to-video performance drop. Thus, for Open-Sora 1.1, we pretrain the model from scratch with masking strategy.
+
+An illustration of the masking strategy config to use in inference is given as follows. A five-number tuple provides great flexibility in defining the mask strategy. By conditioning on generated frames, we can autoregressively generate infinite frames (although error propagates).
+
+![mask strategy config](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask_config.png)
+
+A detailed explanation of the mask strategy usage is available in [docs/config.md](/docs/config.md#advanced-inference-config).
+
+## Data Collection & Pipeline
+
+As we found in Open-Sora 1.0, the quantity and quality of data are crucial for training a good model, so we work hard on scaling the dataset. First, we create an automatic pipeline following [SVD](https://arxiv.org/abs/2311.15127), including scene cutting, captioning, various scoring and filtering, and dataset management scripts and conventions. More information can be found in [docs/data_processing.md](/docs/data_processing.md).
+
+![pipeline](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_data_pipeline.png)
+
+We plan to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk IO a bottleneck for training and data processing at the same time. Thus, we can only prepare a 10M dataset and did not go through the entire processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below. More information about the dataset can be found in [docs/datasets.md](/docs/datasets.md).
+
+Image text tokens (by T5 tokenizer):
+
+![image text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_image_textlen.png)
+
+Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens.
+
+![video text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_textlen.png)
+
+Video duration:
+
+![video duration](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_duration.png)
+
+## Training Details
+
+With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied.
+
+1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
+2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py). The video looking is good, but the model does not know much about the temporal knowledge. We use mask ratio of 10%.
+3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). We use a lower resolution as we find in Open-Sora 1.0 that the model can learn temporal knowledge with relatively low resolution.
+4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. Similar to SD3, we find the model quickly adapt to the QK-normalization. We also switch iddpm-speed to iddpm, and increase the mask ratio to 25% as we find image-condition not learning well. We trained for **17k** steps for **14 hours**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step **81k**.
+5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data.
+6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **4k** with **one day** on high-quality data. We find loading previous stage's optimizer state can help the model learn faster.
+
+To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on 64 H800 GPUs.
+
+## Limitation and Future Work
+
+As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work.
+
+- **Generation Failure**: we find many cases (especially when the total token number is large or the content is complex),  our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version.
+- **Noisy generation and lack of fluency**: we find the generated video is sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version.
+- **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency.
+- **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation.
+- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which is not conducted due to the IO bottleneck. We plan to filter the data by aesthetic score and finetune the model to improve the aesthetic score.
+- **Worse quality for longer video generation**: we find with a same prompt, the longer video has worse quality. This means the image quality is not equally adapted to different lengths of sequences.
+
+> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou, Tianyi Li
+> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
diff --git a/docs/report_03.md b/docs/report_03.md
new file mode 100644
index 0000000..6af786b
--- /dev/null
+++ b/docs/report_03.md
@@ -0,0 +1,160 @@
+# Open-Sora 1.2 Report
+
+- [Video compression network](#video-compression-network)
+- [Rectified flow and model adaptation](#rectified-flow-and-model-adaptation)
+- [More data and better multi-stage training](#more-data-and-better-multi-stage-training)
+- [Easy and effective model conditioning](#easy-and-effective-model-conditioning)
+- [Evaluation](#evaluation)
+- [Sequence parallelism](#sequence-parallelism)
+
+In the Open-Sora 1.2 release, we train a 1.1B model on >30M data (\~80k hours), with a training cost of 35k H100 GPU hours, supporting 0s\~16s, 144p to 720p, various aspect ratios video generation. Our configurations are listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
+
+|      | image | 2s  | 4s  | 8s  | 16s |
+| ---- | ----- | --- | --- | --- | --- |
+| 240p | ✅     | ✅   | ✅   | ✅   | ✅   |
+| 360p | ✅     | ✅   | ✅   | ✅   | ✅   |
+| 480p | ✅     | ✅   | ✅   | ✅   | 🆗   |
+| 720p | ✅     | ✅   | ✅   | 🆗   | 🆗   |
+
+Here ✅ means that the data is seen during training, and 🆗 means although not trained, the model can inference at that config. Inference for 🆗 requires more than one 80G memory GPU and sequence parallelism.
+
+Besides features introduced in Open-Sora 1.1, Open-Sora 1.2 highlights:
+
+- Video compression network
+- Rectified-flow training
+- More data and better multi-stage training
+- Easy and effective model conditioning
+- Better evaluation metrics
+
+All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.2 release. The following sections will introduce the details of the improvements. We also refine our codebase and documentation to make it easier to use and develop, and add a LLM to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages.
+
+## Video compression network
+
+For Open-Sora 1.0 & 1.1, we used stability-ai's 83M 2D VAE, which compress the video only in the spatial dimension by 8x8 times. To reduce the temporal dimension, we extracted one frame in every three frames. However, this method led to the low fluency of generated video as the generated fps is sacrificed. Thus, in this release, we introduce the video compression network as OpenAI's Sora does. With a 4 times compression in the temporal dimension, we do not need to extract frames and can generate videos with the original fps.
+
+Considering the high computational cost of training a 3D VAE, we hope to re-use the knowledge learnt in the 2D VAE. We notice that after 2D VAE's compression, the features adjacent in the temporal dimension are still highly correlated. Thus, we propose a simple video compression network, which first compress the video in the spatial dimension by 8x8 times, then compress the video in the temporal dimension by 4x times. The network is shown below:
+
+![video_compression_network](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_3d_vae.png)
+
+We initialize the 2D VAE with [SDXL's VAE](https://huggingface.co/stabilityai/sdxl-vae), which is better than our previously used one. For the 3D VAE, we adopt the structure of VAE in [Magvit-v2](https://magvit.cs.cmu.edu/v2/), which contains 300M parameters. Along with the 83M 2D VAE, the total parameters of the video compression network is 384M. We train the 3D VAE for 1.2M steps with local batch size 1. The training data is videos from Pexels and Pixabay, and the training video size is mainly 17 frames, 256x256 resolution. Causal convolutions are used in the 3D VAE to make the image reconstruction more accurate.
+
+Our training involves three stages:
+
+1. For the first 380k steps, we train on 8 GPUs and freeze the 2D VAE. The training objective includes the reconstruction of the compressed features from 2D VAE (pink one in the figure) and also add a loss to make features from the 3D VAE similar to the features from the 2D VAE (pink one and green one, called identity loss). We find the latter loss can quickly make the whole VAE achieve a good performance for image and much faster to converge in the next stage.
+2. For the next 260k steps, we remove the identity loss and just learn the 3D VAE.
+3. For the last 540k steps, since we find that only reconstructing the 2D VAE's features cannot lead to further improvement, we remove the loss and train the whole VAE to reconstruct the original videos. This stage is trained on 24 GPUs.
+
+For both stage 1 and stage 2 training, we adopt 20% images and 80% videos. Following [Magvit-v2](https://magvit.cs.cmu.edu/v2/), we train video using 17 frames, while zero-padding the first 16 frames for image. However, we find that this setting leads to blurring of videos with length different from 17 frames. Thus, in stage 3, we use a random number within 34 frames for mixed video length training (a.k.a., zero-pad the first  `43-n` frames if we want to train a `n` frame video), to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release.
+
+When using the VAE for the diffusion model, our stacked VAE requires little memory as our VAE's input is already compressed. We also split the input videos into several 17-frame clips to make the inference more efficient. The performance of our VAE is on par with another open-sourced 3D VAE in [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md).
+
+| Model              | SSIM↑ | PSNR↑  |
+| ------------------ | ----- | ------ |
+| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
+| Open-Sora 1.2      | 0.880 | 30.590 |
+
+## Rectified flow and model adaptation
+
+The latest diffusion models like Stable Diffusion 3 adopt [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. Unfortunately, SD3's rectified flow training code is not open-sourced. However, Open-Sora 1.2 provides the training code following SD3's paper, including:
+
+- Basic rectified flow training ([original rectified flow paper](https://arxiv.org/abs/2209.03003))
+- Logit-norm sampling for training acceleration ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 3.1, intuitively it is more likely to sample timesteps at middle noise level)
+- Resolution and video length aware timestep sampling ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 5.3.2, intuitively it is more likely to sample timesteps with more noise for larger resolution, and we extend it to longer video)
+
+For the resolution-aware timestep sampling, we should use more noise for images with larger resolution. We extend this idea to video generation and use more noise for videos with longer length.
+
+Open-Sora 1.2 starts from the [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) checkpoint. Note that this model is trained with DDPM and SDXL VAE, also a much higher resolution. We find finetuning on a small dataset can easily adapt the model for our video generation setting. The adaptation process is as follows, all training is done on 8 GPUs (the adaptation for the diffusion model is quite fast and straightforward):
+
+1. Multi-resolution image generation ability: we train the model to generate different resolution ranging from 144p to 2K for 20k steps.
+2. QK-norm: we add the QK-norm to the model and train for 18k steps.
+3. Rectified flow: we transform from discrete-time DDPM to continuous-time rectified flow and train for 10k steps.
+4. Rectified flow with logit-norm sampling and resolution-aware timestep sampling: we train for 33k steps.
+5. Smaller AdamW epsilon: following SD3, with QK-norm, we can use a smaller epsilon (1e-15) for AdamW, we train for 8k steps.
+6. New VAE and fps conditioning: we replace the original VAE with ours and add fps conditioning to the timestep conditioning, we train for 25k steps. Note that normalizing each channel is important for rectified flow training.
+7. Temporal attention blocks: we add temporal attention blocks with zero initialized projection layers. We train on images for 3k steps.
+8. Temporal blocks only for video with mask strategy: we train the temporal attention blocks only on videos for 38k steps.
+
+After the above adaptation, we are ready to train the model on videos. The adaptation above maintains the original model's ability to generate high-quality images, and brings multiple benefits for video generation:
+
+- With rectified flow, we can accelerate the training and reduce the number of sampling steps for video from 100 to 30, which greatly reduces the waiting time for inference.
+- With QK-norm, the training is more stable and an aggressive optimizer can be used.
+- With new VAE, the temporal dimension is compressed by 4 times, which makes the training more efficient.
+- With multi-resolution image generation ability, the model can generate videos with different resolutions.
+
+## More data and better multi-stage training
+
+Due to a limited computational budget, we carefully arrange the training data from low to high quality and split our training into three stages. Our training involves 12x8 GPUs, and the total training time is about 2 weeks for about 70k steps.
+
+### First stage
+
+We first train the model on Webvid-10M datasets (40k hours) for 30k steps (2 epochs). Since the video is all lower than 360p resolution and contains watermark, we train on this dataset first. The training mainly happens on 240p and 360p, with video length 2s~16s. We use the original caption in the dataset for training. The training config locates in [stage1.py](/configs/opensora-v1-2/train/stage1.py).
+
+### Second stage
+
+Then we train the model on Panda-70M datasets. This dataset is large but the quality varies. We use the official 30M subset which clips are more diverse, and filter out videos with aesthetic score lower than 4.5. This leads to a 20M subset with 41k hours. The captions in the dataset are directly used for our training. The training config locates in [stage2.py](/configs/opensora-v1-2/train/stage2.py).
+
+The training mainly happens on 360p and 480p. We train the model for 23k steps, which is 0.5 epoch. The training is not fully done since we hope our new model can meet you earlier.
+
+### Third stage
+
+In this stage, we collect ~2M video clips with a total length of 5K hours from all kinds of sources, including:
+
+- Free-license videos, sourced from Pexels, Pixabay, Mixkit, etc.
+- [MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with long videos, mainly from games and city/scenic exploration.
+- [Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset.
+- And some other datasets.
+
+While MiraData and Vript have captions from GPT, we use [PLLaVA](https://github.com/magic-research/PLLaVA) to caption the rest ones. Compared with LLaVA, which is only capable of single frame/image captioning, PLLaVA is specially designed and trained for video captioning. The [accelerated PLLaVA](/tools/caption/README.md#pllava-captioning) is released in our `tools/`. In practice, we use the pretrained PLLaVA 13B model and select 4 frames from each video for captioning with a spatial pooling shape of 2*2.
+
+Some statistics of the video data used in this stage are shown below. We present basic statistics of duration and resolution, as well as aesthetic score and optical flow score distribution.
+We also extract tags for objects and actions from video captions and count their frequencies.
+![stats](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_video_stats.png)
+![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_objects_count.png)
+![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_actions_count.png)
+
+We mainly train 720p and 1080p videos in this stage, aiming to extend the model's ability to larger resolutions. We use a mask ratio of 25% during training. The training config locates in [stage3.py](/configs/opensora-v1-2/train/stage3.py). We train the model for 15k steps, which is approximately 2 epochs.
+
+## Easy and effective model conditioning
+
+For stage 3, we calculate the aesthetic score and motion score for each video clip. However, since the number of video clips is small, we are not willing to filter out clips with low scores, which leads to a smaller dataset. Instead, we append the scores to the captions and use them as conditioning. We find this method can make model aware of the scores and follows the scores to generate videos with better quality.
+
+For example, a video with aesthetic score 5.5, motion score 10, and a detected camera motion pan left, the caption will be:
+
+```plaintext
+[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
+```
+
+During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools.
+
+## Evaluation
+
+Previously, we monitored the training process only by human evaluation, as DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find the training loss is well correlated with the quality of generated videos as stated in SD3. Thus, we keep track of rectified flow evaluation loss on 100 images and 1k videos.
+
+We sampled 1k videos from pixabay as validation dataset. We calculate the evaluation loss for image and different lengths of videos (2s, 4s, 8s, 16s) for different resolution (144p, 240p, 360p, 480p, 720p). For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged. We also provide a [video](https://streamable.com/oqkkf1) showing the sampled videos with a fixed prompt for different steps.
+
+![Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_val_loss.png)
+![Video Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vid_val_loss.png)
+
+In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the VBench score with 240p 2s videos. The two metrics verify that our model continues to improve during training.
+
+![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vbench_score.png)
+
+All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details.
+
+| Model          | Total Score | Quality Score | Semantic Score |
+| -------------- | ----------- | ------------- | -------------- |
+| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
+| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |
+
+## Sequence parallelism
+
+We use sequence parallelism to support long-sequence training and inference. Our implementation is based on Ulysses and the workflow is shown below. When sequence parallelism is enabled, we only need to apply the `all-to-all` communication to the spatial block in STDiT as only spatial computation is dependent on the sequence dimension.
+
+![SP](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/sequence_parallelism.jpeg)
+
+Currently, we have not used sequence parallelism for training as data resolution is small, and we plan to do so in the next release. As for inference, we can use sequence parallelism in case your GPU goes out of memory. A simple benchmark shows that sequence parallelism can achieve speedup:
+
+| Resolution | Seconds | Number of GPUs | Enable SP | Time taken/s | Speedup per GPU |
+| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
+| 720p       | 16s     | 1              | No        | 547.97       | -               |
+| 720p       | 16s     | 2              | Yes       | 244.38       | 12%             |
diff --git a/docs/report_04.md b/docs/report_04.md
new file mode 100644
index 0000000..4d025f1
--- /dev/null
+++ b/docs/report_04.md
@@ -0,0 +1,116 @@
+# Open-Sora 1.3 Report
+
+- [Video compression network](#video-compression-network)
+- [Upgraded STDiT with shifted-window attention](#upgraded-stdit-with-shifted-window-attention)
+- [Easy and effective model conditioning](#easy-and-effective-model-conditioning)
+- [Evaluation](#evaluation)
+
+In Open-Sora 1.3 release, we train a 1.1B model on >60M data (\~85k hours), with training cost 35k H100 GPU hours, supporting 0s\~113 frames, 360p & 720p, various aspect ratios video generation. Our configurations are listed below. Following our 1.2 version, Open-Sora 1.3 can also do image-to-video generation and video extension.
+
+|      | image | 49 frames  | 65 frames  | 81 frames  | 97 frames | 113 frames |
+| ---- | ----- | ---------- | ---------- | ---------- | --------- | ---------- |
+| 360p | ✅     | ✅         | ✅         | ✅         | ✅         |✅          |
+| 720p | ✅     | ✅         | ✅         | ✅         | ✅         |✅          |
+
+Here ✅ means that the data is seen during training.
+
+Besides features introduced in Open-Sora 1.2, Open-Sora 1.3 highlights:
+
+- Video compression network
+- Upgraded STDiT with shifted-window attention
+- More data and better multi-stage training
+- Easy and effective model conditioning
+- Better evaluation metrics
+
+All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.3 release. The following sections will introduce the details of the improvements. We also refine our codebase and documentation to make it easier to use and develop, and add an LLM refiner to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages.
+
+## Video compression network
+
+In Open-Sora 1.2, the video compression architecture employed a modular approach, where spatial and temporal dimensions were handled separately. The spatial VAE, based on Stability AI's SDXL VAE, compressed individual frames along the spatial dimensions. The temporal VAE then processed the latent representations from the spatial VAE to handle temporal compression. This two-stage design allowed effective spatial and temporal compression but introduced limitations. These included inefficiencies in handling long videos due to fixed-length input frames, a lack of seamless integration between spatial and temporal features, and higher memory requirements during both training and inference.
+
+Open-Sora 1.3 introduces a unified approach to video compression. By combining spatial and temporal processing into a single framework and leveraging advanced features like tiled 3D convolutions and dynamic frame support, Open-Sora 1.3 achieves better efficiency, scalability, and reconstruction quality. Here are the key improvements in Open-Sora 1.3 VAE:
+
+**1. Unified Spatial-Temporal Processing:** Instead of using separate VAEs for spatial and temporal compression, Open-Sora 1.3 adopts a single encoder-decoder structure that simultaneously handles both dimensions. This approach eliminates the need for intermediate representations and redundant data transfers between spatial and temporal modules.
+
+**2. Tiled 3D Convolutions:** Open-Sora 1.3 incorporates tiled 3D convolution support for the temporal dimension. By breaking down videos into smaller temporal tiles, this feature enables efficient encoding and decoding of longer video sequences without increasing memory overhead. This improvement addresses the limitations of Open-Sora 1.2 in handling large frame counts and ensures higher flexibility in temporal compression.
+
+**3. Dynamic Micro-Batch and Micro-Frame Processing:** Open-Sora 1.3 introduces a new micro-batch and micro-frame processing mechanism. This allows for: (1) Adaptive temporal overlap: Overlapping frames during temporal encoding and decoding help reduce discontinuities at tile boundaries. (2) Dynamic frame size support: Instead of being restricted to fixed-length sequences (e.g., 17 frames in Open-Sora 1.2), Open-Sora 1.3 supports dynamic sequence lengths, making it robust for varied video lengths.
+
+**4. Unified Normalization Mechanism:** The normalization process in Open-Sora 1.3 has been refined with tunable scaling (scale) and shifting (shift) parameters that ensure consistent latent space distributions across diverse datasets. Unlike Open-Sora 1.2, where normalization was specific to fixed datasets, this version introduces more generalized parameters and support for frame-specific normalization strategies.
+
+
+#### Summary of Improvements
+
+| Feature                | Open-Sora 1.2                          | Open-Sora 1.3                          |
+|------------------------|-----------------------------------------|-----------------------------------------|
+| **Architecture**       | Separate spatial and temporal VAEs      | Unified spatial-temporal VAE            |
+| **Tiled Processing**   | Not supported                          | Supported (Tiled 3D Convolutions)       |
+| **Frame Length Support**| Fixed (17 frames)                      | Dynamic frame support with overlap      |
+| **Normalization**      | Fixed parameters                       | Tunable scaling and shifting            |
+
+
+## Upgraded STDiT with shifted-window attention
+
+Following the success of OpenSora 1.2, version 1.3 introduces several architectural improvements and new capabilities to enhance video generation quality and flexibility. This section outlines the key improvements and differences between these two versions.
+
+Latest diffusion models like Stable Diffusion 3 adopt the [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. While SD3's rectified flow training code is not open-sourced, OpenSora provides the training code following SD3's paper. OpenSora 1.2 introduced several key strategies from SD3:
+
+1. Basic rectified flow training, which enables continuous-time diffusion
+2. Logit-norm sampling for training acceleration (following SD3 paper Section 3.1), preferentially sampling timesteps at middle noise levels
+3. Resolution and video length aware timestep sampling (following SD3 paper Section 5.3.2), using more noise for larger resolutions and longer videos
+
+For OpenSora 1.3, we further enhance the model with significant improvements in architecture, capabilities, and performance:
+
+#### 1. Shift-Window Attention Mechanism
+- Introduced kernel-based local attention with configurable kernel_size for efficient computation
+- Implemented shift-window partitioning strategy similar to Swin Transformer
+- Added padding mask handling for window boundaries with extra_pad_on_dims support
+- Extended position encoding with 3D relative positions within local windows (temporal, height, width)
+#### 2. Enhanced Position Encoding
+- Improved RoPE implementation with reduced rotation_dim (1/3 of original) for 3D scenarios
+- Added separate rotary embeddings for temporal, height, and width dimensions
+- Implemented resolution-adaptive scaling for position encodings
+- Optional spatial RoPE for better spatial relationship modeling
+#### 3. Flexible Generation
+- Added I2V and V2V capabilities with dedicated conditioning mechanisms
+- Introduced conditional embedding modules (x_embedder_cond and x_embedder_cond_mask)
+- Zero-initialized condition embeddings for stable training
+- Flexible temporal modeling with skip_temporal option
+#### 4. Performance Optimization
+- Refined Flash Attention triggering conditions (N > 128) for better efficiency
+- Added support for torch.scaled_dot_product_attention (SDPA) as an alternative backend
+- Optimized memory usage through improved padding and window partitioning
+- Enhanced sequence parallelism with adaptive height padding
+
+The adaptation process from [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) remains similar but with additional steps:
+1-7. [Same as v1.2: multi-resolution training, QK-norm, rectified flow, logit-norm sampling, smaller AdamW epsilon, new VAE, and basic temporal attention]
+#### 8. Enhanced temporal blocks
+   - Added kernel-based local attention with shift-window support
+   - Implemented 3D relative position encoding with resolution-adaptive scaling
+   - Zero-initialized projection layers with improved initialization strategy
+
+Compared to v1.2 which focused on basic video generation, v1.3 brings substantial improvements in three key areas: **1. Quality**: Enhanced spatial-temporal modeling through shift-window attention and 3D position encoding. **2. Flexibility**: Support for I2V/V2V tasks and configurable temporal modeling. **3. Efficiency**: Optimized attention computation and memory usage
+
+These improvements maintain backward compatibility with v1.2's core features while extending the model's capabilities for real-world applications. The model retains its ability to generate high-quality images and videos using rectified flow, while gaining new strengths in conditional generation and long sequence modeling.
+
+## Easy and effective model conditioning
+
+We calculate the aesthetic score and motion score for each video clip, and filter out those clips with low scores, which leads to a dataset with better video quality. Additionally, we append the scores to the captions and use them as conditioning. Specifically, we convert numerical scores into descriptive language based on predefined ranges. The aesthetic score transformation function converts numerical aesthetic scores into descriptive labels based on predefined ranges: scores below 4 are labeled "terrible," progressing through "very poor," "poor," "fair," "good," and "very good," with scores of 6.5 or higher labeled as "excellent." Similarly, the motion score transformation function maps motion intensity scores to descriptors: scores below 0.5 are labeled "very low," progressing through "low," "fair," "high," and "very high," with scores of 20 or more labeled as "extremely high." We find that this method makes the model aware of the scores, so that it follows them to generate videos with better quality.
+
+For example, for a video with an aesthetic score of 5.5, a motion score of 10, and a detected camera motion of pan left, the caption will be:
+
+```plaintext
+[Original Caption] The aesthetic score is good, the motion strength is high, camera motion: pan left.
+```
+
+During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools.
+
+## Evaluation
+
+Previously, we monitored the training process only by human evaluation, as DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find the training loss is well correlated with the quality of generated videos as stated in SD3. Thus, we keep track of rectified flow evaluation loss on 100 images and 1k videos.
+
+We sampled 1k videos from pixabay as validation dataset. We calculate the evaluation loss for image and different lengths of videos (49 frames, 65 frames, 81 frames, 97 frames, 113 frames) for different resolutions (360p, 720p). For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged.
+
+In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the vbench score with 360p 49-frame videos. The two metrics verify that our model continues to improve during training.
+
+All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details.
diff --git a/docs/train.md b/docs/train.md
new file mode 100644
index 0000000..8eb487f
--- /dev/null
+++ b/docs/train.md
@@ -0,0 +1,201 @@
+# Step by step to train or finetune your own model
+
+## Installation
+
+Besides the installation in the main page, you need to install the following packages:
+
+```bash
+pip install git+https://github.com/hpcaitech/TensorNVMe.git # requires cmake, for checkpoint saving
+pip install pandarallel # for parallel processing
+```
+
+## Prepare dataset
+
+The dataset should be presented in a `csv` or `parquet` file. To better illustrate the process, we will use a 45k [pexels dataset](https://huggingface.co/datasets/hpcai-tech/open-sora-pexels-45k) as an example. This dataset contains clipped, score filtered high-quality videos from [Pexels](https://www.pexels.com/).
+
+First, download the dataset to your local machine:
+
+```bash
+mkdir datasets
+cd datasets
+# For Chinese users, export HF_ENDPOINT=https://hf-mirror.com to speed up the download
+huggingface-cli download --repo-type dataset hpcai-tech/open-sora-pexels-45k --local-dir open-sora-pexels-45k # 250GB
+
+cd open-sora-pexels-45k
+cat tar/pexels_45k.tar.* > pexels_45k.tar
+tar -xvf pexels_45k.tar
+mv pexels_45k .. # make sure the path is Open-Sora/datasets/pexels_45k
+```
+
+There are three `csv` files provided:
+
+- `pexels_45k.csv`: contains only path and text, which needs to be processed for training.
+- `pexels_45k_necessary.csv`: contains necessary information for training.
+- `pexels_45k_score.csv`: contains score information for each video. The 45k videos are filtered out based on the score. See tech report for more details.
+
+If you want to use custom dataset, at least the following columns are required:
+
+```csv
+path,text,num_frames,height,width,aspect_ratio,resolution,fps
+```
+
+We provide a script to process the `pexels_45k.csv` to `pexels_45k_necessary.csv`:
+
+```bash
+# single process
+python scripts/cnv/meta.py --input datasets/pexels_45k.csv --output datasets/pexels_45k_nec.csv --num_workers 0
+# parallel process
+python scripts/cnv/meta.py --input datasets/pexels_45k.csv --output datasets/pexels_45k_nec.csv --num_workers 64
+```
+
+> The process may take a while, depending on the number of videos in the dataset. The process is necessary for training on arbitrary aspect ratio, resolution, and number of frames.
+
+## Training
+
+The command format to launch training is as follows:
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py [path/to/config] --dataset.data-path [path/to/dataset] [override options]
+```
+
+For example, to train a model with stage 1 config from scratch using pexels dataset:
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv
+```
+
+### Config
+
+All configs are located in `configs/diffusion/train/`. The following rules are applied:
+
+- `_base_ = ["config_to_inherit"]`: inherit from another config by mmengine's support. Variables are overwritten by the new config. Dictionary is merged if `_delete_` key is not present.
+- command line arguments override the config file. For example, `--lr 1e-5` will override the `lr` in the config file. `--dataset.data-path datasets/pexels_45k_necessary.csv` will override the `data-path` value in the dictionary `dataset`.
+
+The `bucket_config` is used to control different training stages. It is a dictionary of dictionaries. The tuple means (sampling probability, batch size). For example:
+
+```python
+bucket_config = {
+    "256px": {
+        1: (1.0, 45), # for 256px images, use 100% of the data with batch size 45
+        33: (1.0, 12), # for 256px videos with no less than 33 frames, use 100% of the data with batch size 12
+        65: (1.0, 6), # for 256px videos with no less than 65 frames, use 100% of the data with batch size 6
+        97: (1.0, 4), # for 256px videos with no less than 97 frames, use 100% of the data with batch size 4
+        129: (1.0, 3), # for 256px videos with no less than 129 frames, use 100% of the data with batch size 3
+    },
+    "768px": {
+        1: (0.5, 13), # for 768px images, use 50% of the data with batch size 13
+    },
+    "1024px": {
+        1: (0.5, 7), # for 1024px images, use 50% of the data with batch size 7
+    },
+}
+```
+
+We provide the following configs, the batch size is searched on H200 GPUs with 140GB memory:
+
+- `image.py`: train on images only.
+- `stage1.py`: train on videos with 256px resolution.
+- `stage2.py`: train on videos with 768px resolution with sequence parallelism (default 4).
+- `stage1_i2v.py`: train t2v and i2v with 256px resolution.
+- `stage2_i2v.py`: train t2v and i2v with 768px resolution.
+
+We also provide a demo config `demo.py` with small batch size for debugging.
+
+### Fine-tuning
+
+To finetune from Open-Sora v2, run:
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/Open_Sora_v2.safetensors
+```
+
+To finetune from flux-dev, we provided a transformed flux-dev [ckpts](https://huggingface.co/hpcai-tech/flux1-dev-fused-rope). Download it to `ckpts` and run:
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/flux1-dev-fused-rope.safetensors
+```
+
+### Multi-GPU
+
+To train on multiple GPUs, use `colossalai run`:
+
+```bash
+colossalai run --hostfile hostfiles --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/Open_Sora_v2.safetensors
+```
+
+`hostfiles` is a file that contains the IP addresses of the nodes. For example:
+
+```bash
+xxx.xxx.xxx.xxx
+yyy.yyy.yyy.yyy
+zzz.zzz.zzz.zzz
+```
+
+Use `--wandb True` to log the training process to [wandb](https://wandb.ai/).
+
+### Resume training
+
+To resume training, use `--load`. It will load the optimizer state and dataloader state.
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --load outputs/your_experiment/epoch*-global_step*
+```
+
+If you want to load the optimizer state but not the dataloader state, use:
+
+```bash
+torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --load outputs/your_experiment/epoch*-global_step* --start-step 0 --start-epoch 0
+```
+
+> Note if dataset, batch size, and number of GPUs are changed, the dataloader state will not be meaningful.
+
+## Inference
+
+The inference is the same as described in the main page. The command format is as follows:
+
+```bash
+torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --model.from_pretrained outputs/your_experiment/epoch*-global_step*
+```
+
+## Advanced Usage
+
+More details are provided in the tech report. If explanation for some techniques is needed, feel free to open an issue.
+
+- Tensor parallelism and sequence parallelism
+- Zero 2
+- Pin memory organization
+- Garbage collection organization
+- Data prefetching
+- Communication bucket optimization
+- Shardformer for T5
+
+### Gradient Checkpointing
+
+We support selective gradient checkpointing to save memory. The `grad_ckpt_setting` is a tuple, the first element is the number of dual layers to apply gradient checkpointing, the second element is the number of single layers to apply full gradient. A very large number will apply full gradient to all layers.
+
+```python
+grad_ckpt_setting = (100, 100)
+model = dict(
+    grad_ckpt_setting=grad_ckpt_setting,
+)
+```
+
+To further save memory, you can offload gradient checkpointing to CPU by:
+
+```python
+grad_ckpt_buffer_size = 25 * 1024**3 # 25GB
+```
+
+### Asynchronous Checkpoint Saving
+
+With `--async-io True`, the checkpoint will be saved asynchronously with the support of ColossalAI. This will save time for checkpoint saving.
+
+### Dataset
+
+With a very large dataset, the `csv` file or even `parquet` file may be too large to fit in memory. We provide a script to split the dataset into smaller chunks:
+
+```bash
+python scripts/cnv/shard.py /path/to/dataset.parquet
+```
+
+Then a folder with shards will be created. You can use the `--dataset.memory_efficient True` to load the dataset shard by shard.
diff --git a/docs/zh_CN/report_v1.md b/docs/zh_CN/report_v1.md
new file mode 100644
index 0000000..ac16b4c
--- /dev/null
+++ b/docs/zh_CN/report_v1.md
@@ -0,0 +1,49 @@
+# Open-Sora v1 技术报告
+
+OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而，它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”，我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。
+
+## 选择高效的架构
+
+为了降低计算成本，我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而，我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源，而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此，我们决定在我们第一个版本中使用2D VAE（来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)）。
+
+视频训练涉及大量的token。考虑到24fps的1分钟视频，我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍，我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此，我们使用时空注意力来降低成本，这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。
+
+如图中所示，在STDiT（ST代表时空）中，我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而，我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好，但我们在16x256x256视频上的实验表明，相同数量的迭代次数下，性能排名为：DiT（完整）> STDiT（顺序）> STDiT（并行）≈ Latte。因此，我们出于效率考虑选择了STDiT（顺序）。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。
+
+
+![Architecture Comparison](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch_comp.png)
+
+为了专注于视频生成，我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型，具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型，并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力，而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。
+
+![Architecture](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch.jpg)
+
+借鉴PixArt-α和Stable Video Diffusion的成功，我们还采用了渐进式训练策略：在366K预训练数据集上进行16x256x256的训练，然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入，这一策略极大地降低了计算成本。
+
+我们还尝试在DiT中使用3D patch嵌入器。然而，在时间维度上2倍下采样后，生成的视频质量较低。因此，我们将在下一版本中将下采样留给时间VAE。目前，我们在每3帧采样一次进行16帧训练，以及在每2帧采样一次进行64帧训练。
+
+
+## 数据是训练高质量模型的核心
+
+我们发现数据的数量和质量对生成视频的质量有很大的影响，甚至比模型架构和训练策略的影响还要大。目前，我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割（366K个视频片段）。这些视频的质量参差不齐，而且字幕也不够准确。因此，我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA，一个图像字幕模型，通过三个帧和一个设计好的提示来标记视频。有了设计好的提示，LLaVA能够生成高质量的字幕。
+
+![Caption](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_caption.png)
+
+由于我们更加注重数据质量，我们准备收集更多数据，并在下一版本中构建一个视频预处理流程。
+
+## 训练细节
+
+在有限的训练预算下，我们只进行了一些探索。我们发现学习率1e-4过大，因此将其降低到2e-5。在进行大批量训练时，我们发现`fp16`比`bf16`不太稳定，可能会导致生成失败。因此，我们在64x512x512的训练中切换到`bf16`。对于其他超参数，我们遵循了之前的研究工作。
+
+## 损失曲线
+
+16x256x256 预训练损失曲线
+
+![16x256x256 Pretraining Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_1.png)
+
+16x256x256 高质量训练损失曲线
+
+![16x256x256 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_2.png)
+
+16x512x512 高质量训练损失曲线
+
+![16x512x512 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_3.png)
diff --git a/docs/zh_CN/report_v2.md b/docs/zh_CN/report_v2.md
new file mode 100644
index 0000000..5e53a07
--- /dev/null
+++ b/docs/zh_CN/report_v2.md
@@ -0,0 +1,114 @@
+# Open-Sora 1.1 技术报告
+
+- [模型架构修改](#模型架构修改)
+- [支持不同视频长度/分辨率/宽高比/帧率（fps）训练](#支持不同视频长度分辨率宽高比帧率fps训练)
+- [使用Masked DiT作为图生视频/视频生视频模型](#使用masked-dit作为图生视频视频生视频模型)
+- [数据收集和流程](#数据收集和流程)
+- [训练详情](#训练详情)
+- [结果和评价](#结果和评价)
+- [不足和下一步计划](#不足和下一步计划)
+
+在Open-Sora1.1版本中，我们使用了10M数据来训练经过结构调优后的STDiT的700M模型（Open-Sora1.0版本仅用400K数据）。我们实现了[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的以下功能：
+
+- 可变的视频时长、分辨率、宽高比（包括采样灵活性、改进的取景范围和构图）
+- 提示词增加图片和视频选项（使图像动起来、生成式增长视频、视频到视频编辑、连接不同视频）
+- 图像生成功能
+
+为了实现这一目标，我们在预训练阶段使用了多任务学习。对于扩散模型来说，用不同的采样时间步长进行训练已经是一种多任务学习。我们将这一思想在图像和视频的条件生成模型上，进一步扩展到多分辨率、宽高比、帧长、fps以及不同的掩码策略。我们在**0~15s、144p到720p、各种宽高比的视频**上训练模型。虽然由于训练FLOPs不足的限制，生成的视频在时间一致性上的表现没有那么高，但我们仍然可以看到这个模型的巨大潜力。
+
+## 模型架构修改
+
+我们对原始ST-DiT模型进行了以下修改，以获得更好的训练稳定性和模型性能（ST-DiT-2）：
+
+- **在时间注意力模块中添加[旋转位置编码](https://arxiv.org/abs/2104.09864)**：遵循目前LLM的最佳实践，我们将时间注意力模块中的正弦位置编码更改为旋转位置编码，因为它也算一项序列预测任务。
+- **在时间注意力模块中添加AdaIN和LayerNorm**：与空间注意力一样，我们用AdaIN和LayerNorm将时间注意力包裹起来，以稳定训练。
+- **[QK归一化](https://arxiv.org/abs/2302.05442)与[RMSNorm](https://arxiv.org/abs/1910.07467)**：和[SD3](https://arxiv.org/pdf/2403.03206.pdf)类似地，我们应用QK归一化来提高半精度训练的稳定性。
+- **支持动态输入大小和视频条件限定**：为了支持多分辨率、宽高比和fps训练，我们让ST-DiT-2能够接受任何输入大小。延伸[PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)的想法，我们支持限定视频的高度、宽度、宽高比、帧长和fps。
+- **将T5token数量从120扩展到200**：我们使用的视频描述通常少于200个token，我们发现模型也可以很好地处理更长的文本。
+
+## 支持不同视频长度/分辨率/宽高比/帧率（fps）训练
+
+正如[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的，使用原始无损视频的分辨率、宽高比和视频长度进行训练可以增加采样灵活性，改善取景和构图。我们找到了三种实现这一目标的方法：
+- [NaViT](https://arxiv.org/abs/2307.06304)：通过不同掩码策略支持在同一训练批次内使用不同大小的数据，并且训练效率下降很少。然而，该系统实现起来有点复杂，并且可能无法兼容kernel优化技术（如flashattention）。
+- 填充（[FiT](https://arxiv.org/abs/2402.12376)，[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)）：通过填充支持同一批次内的不同大小的数据。然而，将不同的分辨率填充到相同的大小会导致效率降低。
+- 分桶训练（[SDXL](https://arxiv.org/abs/2307.01952)、[PixArt](https://arxiv.org/abs/2310.00426)）：支持通过分桶的方式在不同批次中动态调整大小，但在同一批次内数据大小必须相同，只能应用固定数量的数据大小。在一个批次中，我们不需要实现复杂的掩码或填充。
+
+为了更便捷的实现，我们选择分桶训练的方式。我们预先定义了一些固定的分辨率，并将不同的样本分配到不同的桶中。下面列出了分桶方案中值得注意的点。但我们可以看到，这些在我们的实验中并不是一个大问题。
+
+<details>
+<summary>查看注意事项</summary>
+
+- 桶大小被限制为固定数量：首先，在实际应用中，通常只使用少数宽高比（9:16、3:4）和分辨率（240p、1080p）。其次，我们发现经过训练的模型可以很好地推广到未见过的分辨率。
+- 每批的大小相同，打破了独立同分布（i.i.d.）假设：由于我们使用多个 GPU，因此不同 GPU 上的本地批次具有不同的大小。我们没有发现此问题导致性能显著下降。
+- 可能没有足够的样本来填充每个桶，并且分布可能有偏差：首先，当本地批量大小不太大时，我们的数据集足够大以填充每个桶。其次，我们应该分析数据大小的分布并相应地定义桶大小。第三，分配不平衡并没有显著影响训练过程。
+- 不同的分辨率和帧长可能有不同的处理速度：与PixArt只处理相似分辨率（相似token数）的宽高比不同，我们需要考虑不同分辨率和帧长的处理速度。我们可以使用“bucket_config”来定义每个桶的批量大小，以确保处理速度相似。
+
+</details>
+
+![bucket](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_bucket.png)
+
+如图所示，桶是（分辨率，帧数量，宽高比）的三元组。我们为不同的分辨率提供预定义的宽高比，涵盖了大多数常见的视频宽高比。在每个epoch之前，我们打乱数据集并将样本分配到不同的桶中，如图所示。我们将样本放入最大分辨率和帧长度小于视频的桶中。
+
+考虑到我们的计算资源有限，我们进一步为每个（分辨率，num_frame）二元组引入keep_prob和batch_size两个属性，以降低计算成本并实现多阶段训练。具体来说，高清视频将以概率1-keep_prob下采样到较低分辨率的桶中，并且每个桶的样本数量是由batch_size属性决定的。这样，我们可以控制不同桶中的样本数量，并通过为每个桶搜索合适的数据量来平衡GPU负载。
+
+有关训练中桶使用的详细说明，请参阅[配置文件](/docs/config.md#training-bucket-configs).
+
+## 使用Masked DiT作为图生视频/视频生视频模型
+
+Transformer可以很容易地扩展到支持图生图和视频生视频的任务。我们提出了一种蒙版策略来支持图像和视频的调节。蒙版策略如下图所示。
+
+![mask strategy](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask.png)
+
+在将图像或视频转换成另一个视频的过程中，我们通常会选择出需要作为条件的帧并取消其掩码（unmask）。在使用ST-DiT模型进行前向传播时，被选择取消掩码（unmask）的帧将被赋予时间步长0，而其他帧则保持它们原有的时间步长t。我们发现，如果直接将这种策略应用到训练好的模型上，会得到较差的结果，因为扩散模型在训练过程中并未学会如何处理一个样本中具有不同时间步长的帧。
+
+受[UL2](https://arxiv.org/abs/2205.05131)的启发，我们在训练期间引入了随机掩码策略。具体来说，我们在训练期间随机取消掩码帧，包括取消掩码第一帧，前k帧，最后一帧，最后k帧，第一和最后k帧，随机帧等。基于Open-Sora 1.0模型，以50%的概率应用掩码策略，我们发现模型能够在10,000步的训练中学会处理图像条件（而30%的概率会导致处理能力变差），同时文本到视频的性能略有下降。因此，在Open-Sora 1.1版本中，我们从头开始预训练模型，并采用了掩码策略。
+
+下图给出了用于推理的掩码策略配置的说明。五数字元组在定义掩码策略方面提供了极大的灵活性。
+
+![mask strategy config](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask_config.png)
+
+掩码策略用法的详细说明可在[配置文件](/docs/config.md#advanced-inference-config)中查看.
+
+
+## 数据收集和流程
+
+正如我们在Open-Sora 1.0版本中看见的那样，数据数量和质量对于训练一个好的模型至关重要，因此，我们努力扩展数据集。首先，我们创建了一个遵循[SVD](https://arxiv.org/abs/2311.15127)的自动流水线，包括场景切割、字幕、各种评分和过滤以及数据集管理脚本和通用惯例。
+
+![pipeline](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_data_pipeline.png)
+
+我们计划使用[panda-70M](https://snap-research.github.io/Panda-70M/)和其他数据来训练模型，大约包含3000万条数据。然而，我们发现磁盘输入输出（disk IO）在同时进行训练和数据处理时成为了一个瓶颈。因此，我们只能准备一个包含1000万条数据的数据集，并且没有完成我们构建的所有处理流程。最终，我们使用了包含970万视频和260万图像的数据集进行预训练，以及560,000视频和160万图像的数据集进行微调。预训练数据集的统计信息如下所示。
+
+图像文本标记 (使用T5分词器)：
+![image text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_image_textlen.png)
+
+视频文本标记 (使用T5分词器)。我们直接使用Panda的短视频描述进行训练，并自己给其他数据集加视频描述。生成的字幕通常少于200个token。
+![video text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_textlen.png)
+
+视频时长：
+![video duration](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_duration.png)
+
+## 训练详情
+
+由于计算资源有限，我们必须仔细监控训练过程，并在推测模型学习不佳时更改训练策略，因为没有消融研究的计算。因此，Open-Sora1.1版本的训练包括多个更改，所以，指数移动平均（EMA）未被应用。
+
+1. 首先，我们从`Pixart-alpha-1024`的模型checkpoint开始，使用不同分辨率的图像进行了6000步的微调。我们发现模型能够很容易地适应并生成不同分辨率的图像。为了加快扩散过程的训练，我们使用了[SpeeDiT](https://github.com/1zeryu/SpeeDiT)（iddpm-speed）技术。
+2. **[阶段一]** 然后，我们使用梯度检查点（gradient-checkpointing）技术对模型进行了**24,000**步的预训练，这个过程在64个H800 GPU上运行了**4天**。尽管模型看到的数据样本数量相同，我们发现与使用较小批量大小相比，模型的学习速度较慢。我们推测，在训练的早期阶段，步数的数量对于训练更为重要。大多数视频的分辨率是**240p**，预训练时使用的配置与[stage2.py](/configs/opensora-v1-1/train/stage2.py)相似。
+3. **[阶段一]** 为了增加训练步数，我们改用了更小的批量大小，并且没有使用梯度检查点技术。在这个阶段，我们还引入了帧率（fps）条件。模型训练了**40,000**步，持续了**2天**。训练中使用的视频大多数是**144p**分辨率，使用的配置文件是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。我们使用较低的分辨率，因为我们在Open-Sora 1.0版本中发现模型可以以相对较低的分辨率学习时间知识。
+4. **[阶段一]** 我们发现模型不能很好地学习长视频，并在Open-Sora1.0训练中发现了一个噪声生成结果，推测是半精度问题。因此，我们采用QK-归一化来稳定训练。我们还将iddpm-speed切换成iddpm。我们训练了**17k**步**14小时**。大多数视频的分辨率是144p，预训练时使用的配置是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。阶段1训练持续约一周，总步长**81k**。
+5. **[阶段二]** 我们切换到更高的分辨率，其中大多数视频是**240p和480p**分辨率（[stage2.py](/configs/opensora-v1-1/train/stage2.py)）。我们在所有预训练数据上训练了**22000**步，持续**一天**。
+6. **[阶段三]** 我们切换到更高的分辨率，大多数视频的分辨率是**480p和720p**（[stage3.py](/configs/opensora-v1-1/train/stage3.py)）。我们在高质量数据上训了**4000**步，用时**一天**。
+
+## 结果和评价
+
+## 不足和下一步计划
+
+随着我们离Sora的复现又近了一步，我们发现当前模型存在许多不足，这些不足将在我们下阶段工作中得到改善。
+
+- **噪音的生成和影响**：我们发现生成的模型，特别是长视频中，有时很多噪点，不流畅。我们认为问题在于没有使用时间VAE。由于[Pixart-Sigma](https://arxiv.org/abs/2403.04692)发现适应新VAE很容易，我们计划在下一个版本中为模型开发时间VAE。
+- **缺乏时间一致性**：我们发现模型无法生成具有高时间一致性的视频，我们认为问题是由于缺乏训练FLOPs，我们计划收集更多数据并继续训练模型以提高时间一致性。
+- **人像生成质量低**：我们发现模型无法生成高质量的人类视频，我们认为问题是由于缺乏人类数据，我们计划收集更多的人类数据，并继续训练模型以提高人类生成。
+- **美学得分低**：我们发现模型的美学得分不高。问题在于缺少美学得分过滤，由于IO瓶颈，我们没有进行这一步骤。我们计划通过美学得分和微调模型来过滤数据，以提高美学得分。
+- **长视频生成质量低**：我们发现，使用同样的提示词，视频越长，质量越差。这意味着图像质量不能同等地被不同长度的序列所适应。
+
+> - **算法与加速实现**：Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
+> - **数据收集与处理**：Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
diff --git a/docs/zh_CN/report_v3.md b/docs/zh_CN/report_v3.md
new file mode 100644
index 0000000..c62fcbe
--- /dev/null
+++ b/docs/zh_CN/report_v3.md
@@ -0,0 +1,159 @@
+# Open-Sora 1.2 报告
+
+- [视频压缩网络](#视频压缩网络)
+- [整流流和模型适应](#整流流和模型适应)
+- [更多数据和更好的多阶段训练](#更多数据和更好的多阶段训练)
+- [简单有效的模型调节](#简单有效的模型调节)
+- [评估](#评估)
+
+在 Open-Sora 1.2 版本中，我们在 >30M 数据上训练了一个 1.1B 的模型，支持 0s~16s、144p 到 720p、各种宽高比的视频生成。我们的配置如下所列。继 1.1 版本之后，Open-Sora 1.2 还可以进行图像到视频的生成和视频扩展。
+
+|      | 图像 | 2秒  | 4秒  | 8秒  | 16秒 |
+| ---- | ----- | --- | --- | --- | --- |
+| 240p | ✅     | ✅   | ✅   | ✅   | ✅   |
+| 360p | ✅     | ✅   | ✅   | ✅   | ✅   |
+| 480p | ✅     | ✅   | ✅   | ✅   | 🆗   |
+| 720p | ✅     | ✅   | ✅   | 🆗   | 🆗   |
+
+这里✅表示在训练期间可以看到数据，🆗表示虽然没有经过训练，但模型可以在该配置下进行推理。🆗的推理需要多个80G内存的GPU和序列并行。
+
+除了 Open-Sora 1.1 中引入的功能外，Open-Sora 1.2 还有以下重磅更新：
+
+- 视频压缩网络
+- 整流流训练
+- 更多数据和更好的多阶段训练
+- 简单有效的模型调节
+- 更好的评估指标
+
+上述改进的所有实现（包括训练和推理）均可在 Open-Sora 1.2 版本中使用。以下部分将介绍改进的细节。我们还改进了代码库和文档，使其更易于使用。
+
+## 视频压缩网络
+
+对于 Open-Sora 1.0 & 1.1，我们使用了 stability-ai 的 83M 2D VAE，它仅在空间维度上压缩，将视频压缩 8x8 倍。为了减少时间维度，我们每三帧提取一帧。然而，这种方法导致生成的视频流畅度较低，因为牺牲了生成的帧率（fps）。因此，在这个版本中，我们引入了像 OpenAI 的 Sora 一样的视频压缩网络。该网络在时域上将视频大小压缩至四分之一，因此，我们不必再额外抽帧，而可以以原有帧率生成视频。
+
+考虑到训练 3D VAE 的计算成本很高，我们希望重新利用在 2D VAE 中学到的知识。我们注意到，经过 2D VAE 压缩后，时间维度上相邻的特征仍然高度相关。因此，我们提出了一个简单的视频压缩网络，首先将视频在空间维度上压缩 8x8 倍，然后将视频在时间维度上压缩 4 倍。网络如下所示：
+
+![video_compression_network](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_3d_vae.png)
+
+我们用[SDXL 的 VAE](https://huggingface.co/stabilityai/sdxl-vae)初始化 2D VAE ，它比我们以前使用的更好。对于 3D VAE，我们采用[Magvit-v2](https://magvit.cs.cmu.edu/v2/)中的 VAE 结构，它包含 300M 个参数。加上 83M 的 2D VAE，视频压缩网络的总参数为 384M。我们设定batch size 为 1， 对 3D VAE 进行了 1.2M 步的训练。训练数据是来自 pexels 和 pixabay 的视频，训练视频大小主要是 17 帧，256x256 分辨率。3D VAE 中使用causal convolutions使图像重建更加准确。
+
+我们的训练包括三个阶段：
+
+1. 对于前 380k 步，我们冻结 2D VAE并在 8 个 GPU 上进行训练。训练目标包括重建 2D VAE 的压缩特征（图中粉红色），并添加损失以使 3D VAE 的特征与 2D VAE 的特征相似（粉红色和绿色，称为identity loss）。我们发现后者的损失可以快速使整个 VAE 在图像上取得良好的性能，并在下一阶段更快地收敛。
+2. 对于接下来的 260k 步，我们消除identity loss并仅学习 3D VAE。
+3. 对于最后 540k 步，由于我们发现仅重建 2D VAE 的特征无法带来进一步的改进，因此我们移除了loss并训练整个 VAE 来重建原始视频。此阶段在 24 个 GPU 上进行训练。
+
+对于训练的前半部分，我们采用 20% 的图像和 80% 的视频。按照[Magvit-v2](https://magvit.cs.cmu.edu/v2/)，我们使用 17 帧训练视频，同时对图像的前 16 帧进行零填充。然而，我们发现这种设置会导致长度不同于 17 帧的视频变得模糊。因此，在第 3 阶段，我们使用不超过34帧长度的任意帧长度视频进行混合视频长度训练,以使我们的 VAE 对不同视频长度更具鲁棒性（也就是说，如果我们希望训练含有n帧的视频，我们就把原视频中`34-n`帧用0进行填充）。我们的 [训练](/scripts/train_vae.py)和[推理](/scripts/inference_vae.py)代码可在 Open-Sora 1.2 版本中找到。
+
+当使用 VAE 进行扩散模型时，我们的堆叠 VAE 所需的内存较少，因为我们的 VAE 的输入已经经过压缩。我们还将输入视频拆分为几个 17 帧剪辑，以提高推理效率。我们的 VAE 与[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md)中的另一个开源 3D VAE 性能相当。
+
+| 模型          | 结构相似性↑ | 峰值信噪比↑  |
+| ------------------ | ----- | ------ |
+| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
+| Open-Sora 1.2      | 0.880 | 30.590 |
+
+## 整流流和模型适应
+
+最新的扩散模型 Stable Diffusion 3 为了获得更好的性能，采用了[rectified flow](https://github.com/gnobitab/RectifiedFlow)替代了 DDPM。可惜 SD3 的 rectified flow 训练代码没有开源。不过 Open-Sora 1.2 提供了遵循 SD3 论文的训练代码，包括：
+
+- 基本整流流训练
+- 用于训练加速的 Logit-norm 采样
+- 分辨率和视频长度感知时间步长采样
+
+对于分辨率感知的时间步长采样，我们应该对分辨率较大的图像使用更多的噪声。我们将这个想法扩展到视频生成，对长度较长的视频使用更多的噪声。
+
+Open-Sora 1.2 从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) 模型checkpoint开始。请注意，此模型使用 DDPM 和 SDXL VAE 进行训练，分辨率也高得多。我们发现在小数据集上进行微调可以轻松地使模型适应我们的视频生成设置。适应过程如下，所有训练都在 8 个 GPU 上完成：
+
+1. 多分辨率图像生成能力：我们训练模型 20k 步，使其能够生成从 144p 到 2K 的不同分辨率的图像。
+2. QK-norm：我们将 QK-norm 添加到模型中并训练 18k 步。
+3. 整流流：我们从离散时间 DDPM 转变为连续时间整流流并训练 10k 步。
+4. 使用 logit-norm 采样和分辨率感知时间步采样的整流流：我们训练 33k 步。
+5. 较小的 AdamW epsilon：按照 SD3，使用 QK-norm，我们可以对 AdamW 使用较小的 epsilon（1e-15），我们训练 8k 步。
+6. 新的 VAE 和 fps 调节：我们用自己的 VAE 替换原来的 VAE，并将 fps 调节添加到时间步调节中，我们训练 25k 步。请注意，对每个通道进行规范化对于整流流训练非常重要。
+7. 时间注意力模块：我们添加时间注意力模块，其中的投影层采用零初始化。我们在图像上进行 3k 步训练。
+8. 仅在视频上训练时间块并使用掩码策略：我们仅在视频上训练时间注意力块，共训练 38k 步。
+
+经过上述调整后，我们就可以开始在视频上训练模型了。上述调整保留了原始模型生成高质量图像的能力，并为后续的视频生成提供了许多助力：
+
+- 通过整流流，我们可以加速训练，将视频的采样步数从100步减少到30步，大大减少了推理的等待时间。
+- 使用 qk-norm，训练更加稳定，并且可以使用更激进的优化器。
+- 采用新的VAE，时间维度压缩了4倍，使得训练更加高效。
+- 该模型具有多分辨率图像生成能力，可以生成不同分辨率的视频。
+
+## 更多数据和更好的多阶段训练
+
+由于计算预算有限，我们精心安排了训练数据的质量从低到高，并将训练分为三个阶段。我们的训练涉及 12x8 GPU，总训练时间约为 2 周， 约70k步。
+
+### 第一阶段
+
+我们首先在 Webvid-10M 数据集（40k 小时）上训练模型，共 30k 步（2 个 epoch）。由于视频分辨率均低于 360p 且包含水印，因此我们首先在此数据集上进行训练。训练主要在 240p 和 360p 上进行，视频长度为 2s~16s。我们使用数据集中的原始字幕进行训练。训练配置位于[stage1.py](/configs/opensora-v1-2/train/stage1.py)中。
+
+### 第二阶段
+
+然后我们在 Panda-70M 数据集上训练模型。这个数据集很大，但质量参差不齐。我们使用官方的 30M 子集，其中的片段更加多样化，并过滤掉美学评分低于 4.5 的视频。这产生了一个 20M 子集，包含 41k 小时。数据集中的字幕直接用于我们的训练。训练配置位于[stage2.py](/configs/opensora-v1-2/train/stage2.py)中。
+
+训练主要在 360p 和 480p 上进行。我们训练模型 23k 步，即 0.5 个 epoch。训练尚未完成，因为我们希望我们的新模型能早日与大家见面。
+
+### 第三阶段
+
+在此阶段，我们从各种来源收集了 200 万个视频片段，总时长 5000 小时，其中包括：
+
+- 来自 Pexels、Pixabay、Mixkit 等的免费授权视频。
+- [MiraData](https://github.com/mira-space/MiraData)：一个包含长视频的高质量数据集，主要来自游戏和城市/风景探索。
+- [Vript](https://github.com/mutonix/Vript/tree/main)：一个密集注释的数据集。
+- 还有一些其他数据集。
+
+MiraData 和 Vript 有来自 GPT 的字幕，而我们使用[PLLaVA](https://github.com/magic-research/PLLaVA)为其余视频生成字幕。与只能进行单帧/图像字幕的 LLaVA 相比，PLLaVA 是专门为视频字幕设计和训练的。[加速版PLLaVA](/tools/caption/README.md#pllava-captioning)已在我们的`tools/`中发布。在实践中，我们使用预训练的 PLLaVA 13B 模型，并从每个视频中选择 4 帧生成字幕，空间池化形状为 2*2。
+
+下面显示了此阶段使用的视频数据的一些统计数据。我们提供了持续时间和分辨率的基本统计数据，以及美学分数和光流分数分布。我们还从视频字幕中提取了对象和动作的标签并计算了它们的频率。
+![stats](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_video_stats.png)
+![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_objects_count.png)
+![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_actions_count.png)
+
+此阶段我们主要在 720p 和 1080p 上进行训练，以提高模型在高清视频上的表现力。在训练中，我们使用的掩码率为25%。训练配置位于[stage3.py](/configs/opensora-v1-2/train/stage3.py)中。我们对模型进行 15k 步训练，大约为 2 个 epoch。
+
+## 简单有效的模型调节
+
+对于第 3 阶段，我们计算每个视频片段的美学分数和运动分数。但是，由于视频片段数量较少，我们不愿意过滤掉得分较低的片段，这会导致数据集较小。相反，我们将分数附加到字幕中并将其用作条件。我们发现这种方法可以让模型了解分数并遵循分数来生成质量更好的视频。
+
+例如，一段美学评分为 5.5、运动评分为 10 且检测到摄像头运动向左平移的视频，其字幕将为：
+
+```plaintext
+[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
+```
+
+在推理过程中，我们还可以使用分数来调节模型。对于摄像机运动，我们仅标记了 13k 个具有高置信度的剪辑，并且摄像机运动检测模块已在我们的工具中发布。
+
+## 评估
+
+之前，我们仅通过人工评估来监控训练过程，因为 DDPM 训练损失与生成的视频质量没有很好的相关性。但是，对于整流流，如 SD3 中所述，我们发现训练损失与生成的视频质量有很好的相关性。因此，我们跟踪了 100 张图像和 1k 个视频的整流流评估损失。
+
+我们从 pixabay 中抽样了 1k 个视频作为验证数据集。我们计算了不同分辨率（144p、240p、360p、480p、720p）下图像和不同长度的视频（2s、4s、8s、16s）的评估损失。对于每个设置，我们等距采样 10 个时间步长。然后对所有损失取平均值。
+
+![Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_val_loss.png)
+![Video Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vid_val_loss.png)
+
+此外，我们还会在训练过程中跟踪[VBench](https://vchitect.github.io/VBench-project/)得分。VBench 是用于短视频生成的自动视频评估基准。我们用 240p 2s 视频计算 vbench 得分。这两个指标验证了我们的模型在训练过程中持续改进。
+
+![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vbench_score.png)
+
+所有评估代码均发布在`eval`文件夹中。查看[评估指南](/eval/README.md)了解更多详细信息。
+
+|模型        | 总得分 | 质量得分 | 语义分数 |
+| -------------- | ----------- | ------------- | -------------- |
+| Open-Sora V1.0 | 75.91%      | 78.81%        | 64.28%         |
+| Open-Sora V1.2 | 79.23%      | 80.71%        | 73.30%         |
+
+## 序列并行
+
+我们使用序列并行来支持长序列训练和推理。我们的实现基于Ulysses，工作流程如下所示。启用序列并行后，我们只需要将 `all-to-all` 通信应用于STDiT中的空间模块（spatial block），因为在序列维度上，只有对空间信息的计算是相互依赖的。
+
+![SP](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/sequence_parallelism.jpeg)
+
+目前，由于训练数据分辨率较小，我们尚未使用序列并行进行训练，我们计划在下一个版本中使用。至于推理，我们可以使用序列并行，以防您的 GPU 内存不足。下表显示，序列并行可以实现加速：
+
+| 分辨率 | 时长 | GPU数量 | 是否启用序列并行 |用时（秒） | 加速效果/GPU |
+| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
+| 720p       | 16秒     | 1              | 否        | 547.97       | -               |
+| 720p       | 16秒     | 2              | 是        | 244.38       | 12%             |
+
diff --git a/docs/zh_CN/report_v4.md b/docs/zh_CN/report_v4.md
new file mode 100644
index 0000000..56a833e
--- /dev/null
+++ b/docs/zh_CN/report_v4.md
@@ -0,0 +1,117 @@
+# Open-Sora 1.3 报告
+
+- [视频压缩网络](#视频压缩网络)
+- [包含滑动窗口注意力的STDiT](#包含滑动窗口注意力的stdit)
+- [简单有效的模型条件控制](#简单有效的模型条件控制)
+- [评估方法](#评估方法)
+
+在Open-Sora 1.3版本中，我们在超过60M（约85k小时）的数据上训练了一个1.1B参数的模型，训练耗时35k H100 GPU小时，支持0~113帧、360p和720p分辨率以及多种宽高比的视频生成。我们的配置如下。延续1.2版本的特性，Open-Sora 1.3同样支持图像到视频的生成和视频延展。
+
+|      | image | 49 frames  | 65 frames  | 81 frames  | 97 frames | 113 frames |
+| ---- | ----- | ---------- | ---------- | ---------- | --------- | ---------- |
+| 360p | ✅     | ✅         | ✅         | ✅         | ✅         |✅          |
+| 720p | ✅     | ✅         | ✅         | ✅         | ✅         |✅          |
+
+这里✅表示在训练过程中已经见过的数据。
+
+除了Open-Sora 1.2中引入的特性外，Open-Sora 1.3的亮点包括：
+
+- 视频压缩网络
+- 升级版带位移窗口注意力的STDiT
+- 更多数据和更好的多阶段训练
+- 简单有效的模型条件控制
+- 更好的评估指标
+
+以上所有改进的实现（包括训练和推理）都在Open-Sora 1.3版本中提供。以下部分将详细介绍这些改进。我们还优化了代码库和文档以使其更易于使用和开发，并添加了LLM优化器来[优化输入提示词](/README.md#gpt-4o-prompt-refinement)并支持更多语言。
+
+## 视频压缩网络
+
+在Open-Sora 1.2中，视频压缩架构采用了模块化方法，分别处理空间和时间维度。基于Stability AI的SDXL VAE的空间VAE压缩单个帧的空间维度。时间VAE则处理来自空间VAE的潜在表示以实现时间压缩。这种两阶段设计实现了有效的空间和时间压缩，但也带来了一些限制。这些限制包括由于固定长度输入帧而导致的长视频处理效率低下、空间和时间特征之间缺乏无缝集成，以及在训练和推理过程中更高的内存需求。
+
+Open-Sora 1.3引入了统一的视频压缩方法。通过将空间和时间处理结合到单一框架中，并利用诸如分块3D卷积和动态帧支持等高级特性，Open-Sora 1.3实现了更好的效率、可扩展性和重建质量。以下是Open-Sora 1.3 VAE的主要改进：
+
+**1. 统一的时空处理：** 不同于使用独立的VAE进行空间和时间压缩，Open-Sora 1.3采用单一的编码器-解码器结构同时处理这两个维度。这种方法消除了中间表示和空间-时间模块之间的冗余数据传输的需求。
+
+**2. 分块3D卷积：** Open-Sora 1.3在时间维度上引入了分块3D卷积支持。通过将视频分解成更小的时间块，该特性实现了对更长视频序列的高效编码和解码，而不会增加内存开销。这一改进解决了Open-Sora 1.2在处理大量帧时的限制，确保了更高的时间压缩灵活性。
+
+**3. 动态微批次和微帧处理：** Open-Sora 1.3引入了新的微批次和微帧处理机制。这实现了：(1) 自适应时间重叠：时间编码和解码过程中的重叠帧帮助减少块边界的不连续性。(2) 动态帧大小支持：不再局限于固定长度序列（如Open-Sora 1.2中的17帧），Open-Sora 1.3支持动态序列长度，使其能够适应不同的视频长度。
+
+**4. 统一的归一化机制：** Open-Sora 1.3中的归一化过程通过可调的缩放(scale)和平移(shift)参数得到了改进，确保了不同数据集间潜在空间分布的一致性。与Open-Sora 1.2特定于固定数据集的归一化不同，这个版本引入了更通用的参数并支持特定于帧的归一化策略。
+
+#### 改进总结
+
+| 特性           | Open-Sora 1.2                    | Open-Sora 1.3                     |
+|---------------|---------------------------------|----------------------------------|
+| **架构**       | 独立的空间和时间VAE                 | 统一的时空VAE                      |
+| **分块处理**    | 不支持                           | 支持（分块3D卷积）                   |
+| **帧长度支持**  | 固定（17帧）                      | 支持动态帧长度和重叠                 |
+| **归一化**     | 固定参数                         | 可调的缩放和平移参数                 |
+
+## 包含滑动窗口注意力的STDiT
+
+在Open-Sora 1.2取得成功的基础上，1.3版本引入了多项架构改进和新功能，以提升视频生成的质量和灵活性。本节概述了这两个版本之间的主要改进和差异。
+
+最新的扩散模型（如Stable Diffusion 3）采用[rectified flow](https://github.com/gnobitab/RectifiedFlow)代替DDPM以获得更好的性能。虽然SD3的rectified flow训练代码未开源，但OpenSora按照SD3论文提供了训练代码实现。OpenSora 1.2从SD3引入了几个关键策略：
+
+1. 基础的rectified flow训练，实现连续时间扩散
+2. Logit-norm采样用于加速训练（遵循SD3论文第3.1节），优先在中等噪声水平采样时间步
+3. 分辨率和视频长度感知的时间步采样（遵循SD3论文第5.3.2节），对更大分辨率和更长视频使用更多噪声
+
+在OpenSora 1.3中，我们在架构、功能和性能方面进行了显著改进：
+
+#### 1. 位移窗口注意力机制
+- 引入可配置kernel_size的基于核的局部注意力，提高计算效率
+- 实现类似Swin Transformer的位移窗口分区策略
+- 增加带extra_pad_on_dims支持的窗口边界填充掩码处理
+- 在局部窗口（时间、高度、宽度）内扩展3D相对位置编码
+
+#### 2. 增强的位置编码
+- 改进RoPE实现，将rotation_dim降至原来的1/3以适应3D场景
+- 为时间、高度和宽度维度添加独立的旋转嵌入
+- 实现分辨率自适应的位置编码缩放
+- 可选的空间RoPE以更好地建模空间关系
+
+#### 3. 灵活的生成能力
+- 添加I2V和V2V功能，配备专门的条件控制机制
+- 引入条件嵌入模块（x_embedder_cond和x_embedder_cond_mask）
+- 零初始化条件嵌入以实现稳定训练
+- 通过skip_temporal选项实现灵活的时序建模
+
+#### 4. 性能优化
+- 改进Flash Attention触发条件（N > 128）以提高效率
+- 添加torch.scaled_dot_product_attention (SDPA)作为替代后端
+- 通过改进的填充和窗口分区优化内存使用
+- 通过自适应高度填充增强序列并行性
+
+从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma)的适应过程保持相似，但增加了额外步骤：
+[第1-7点与v1.2相同：多分辨率训练、QK-norm、rectified flow、logit-norm采样、更小的AdamW epsilon、新VAE和基础时序注意力]
+#### 8. 增强的时序模块
+   - 添加带位移窗口支持的基于核的局部注意力
+   - 实现带分辨率自适应缩放的3D相对位置编码
+   - 采用改进的初始化策略进行投影层零初始化
+
+相比专注于基础视频生成的v1.2，v1.3在三个关键领域带来了实质性改进：**1. 质量**：通过位移窗口注意力和3D位置编码增强时空建模。**2. 灵活性**：支持I2V/V2V任务和可配置的时序建模。**3. 效率**：优化注意力计算和内存使用
+
+这些改进在保持v1.2核心功能的同时，扩展了模型在实际应用中的能力。模型保留了使用rectified flow生成高质量图像和视频的能力，同时在条件生成和长序列建模方面获得了新的优势。
+
+## 简单有效的模型条件控制
+
+我们对每个视频片段计算美学分数和运动分数，并过滤掉得分较低的片段，从而得到一个视频质量更好的数据集。此外，我们将这些分数附加到标题中并用作条件控制。具体来说，我们基于预定义的范围将数值分数转换为描述性语言。美学分数转换函数基于预定义范围将数值美学分数转换为描述标签：低于4分标记为"terrible"，依次通过"very poor"、"poor"、"fair"、"good"和"very good"，6.5分或更高标记为"excellent"。同样，运动分数转换函数将运动强度分数映射为描述符：低于0.5分标记为"very low"，依次通过"low"、"fair"、"high"和"very high"，20分或更高标记为"extremely high"。我们发现这种方法可以使模型意识到这些分数并遵循分数来生成更高质量的视频。
+
+例如，对于一个美学分数为5.5，运动分数为10，检测到的相机运动为向左平移的视频，其标题将是：
+
+```plaintext
+[Original Caption] The aesthetic score is good, the motion strength is high, camera motion: pan left.
+```
+
+在推理过程中，我们也可以使用这些分数来控制模型。对于相机运动，我们只标记了13k个高置信度的片段，相机运动检测模块已在我们的工具中发布。
+
+## 评估方法
+
+此前，我们仅通过人工评估来监控训练过程，因为DDPM训练损失与生成视频的质量相关性不高。然而，对于rectified flow，我们发现正如SD3所述，训练损失与生成视频的质量有很好的相关性。因此，我们持续跟踪100张图像和1k个视频的rectified flow评估损失。
+
+我们从pixabay采样了1k个视频作为验证数据集。我们计算了不同分辨率（360p，720p）下图像和不同长度视频（49帧、65帧、81帧、97帧、113帧）的评估损失。对于每种设置，我们等距采样10个时间步。然后对所有损失取平均值。
+
+此外，我们还在训练期间跟踪[VBench](https://vchitect.github.io/VBench-project/)分数。VBench是一个用于短视频生成的自动视频评估基准。我们使用360p 49帧视频计算vbench分数。这两个指标验证了我们的模型在训练过程中持续改进。
+
+所有评估代码都在`eval`文件夹中发布。查看[README](/eval/README.md)获取更多详细信息。
\ No newline at end of file
diff --git a/eval/I2V/launch.sh b/eval/I2V/launch.sh
new file mode 100644
index 0000000..c8bdd9b
--- /dev/null
+++ b/eval/I2V/launch.sh
@@ -0,0 +1,106 @@
+BASE_MODEL_PATH=$1
+TRAINED_MODEL_PATH=$2
+I2V_HEAD_PATH=$3
+I2V_TAIL_PATH=$4
+I2V_LOOP_PATH=$5
+I2V_ORI_PATH=$6
+# Positional args: base ckpt, trained ckpt, then optional head/tail/loop/ori prompt files.
+# Any prompt path left empty falls back to the file under assets/texts/i2v/.
+if [ -z "$I2V_ORI_PATH" ]; then
+    I2V_ORI_PATH="assets/texts/i2v/prompts_ori.txt"
+fi
+
+if [ -z "$I2V_HEAD_PATH" ]; then
+    I2V_HEAD_PATH="assets/texts/i2v/prompts_head.txt"
+fi
+
+if [ -z "$I2V_TAIL_PATH" ]; then
+    I2V_TAIL_PATH="assets/texts/i2v/prompts_tail.txt"
+fi
+
+if [ -z "$I2V_LOOP_PATH" ]; then
+    I2V_LOOP_PATH="assets/texts/i2v/prompts_loop.txt"
+fi
+# SAVE_DIR and NUM_FRAMES are only defaulted here, so a value from the environment wins.
+STEP_RECORD=$(basename "$TRAINED_MODEL_PATH")
+if [ -z "$SAVE_DIR" ]; then
+    SAVE_DIR="samples/i2v/test/${STEP_RECORD}"
+fi
+echo "save dir: ${SAVE_DIR}"
+
+if [ -z "$NUM_FRAMES" ]; then
+    NUM_FRAMES=49
+fi
+echo "num frames: ${NUM_FRAMES}"
+
+# Base inference command; expanded unquoted as ${command} by the invocations that follow.
+command="python scripts/inference_i2v.py configs/opensora-v1-3/inference/v2v.py"
+
+# # original uncond
+# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_uncond --use-sdedit False
+
+# # trained uncond
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_uncond --use-sdedit False
+
+# trained uncond
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_cond_none_image1osci --use-sdedit False --use-oscillation-guidance-for-image True --image-cfg-scale 1 --cond-type "none" --start-index 1 --end-index 2
+
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_cond_none_image1osci_bias0 --use-sdedit False --use-oscillation-guidance-for-image True --image-cfg-scale 1 --cond-type "none" --start-index 0 --end-index 2
+
+
+# trained cond: i2v_head
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image1.5osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image1.5osci_text7.5osci_bias0 --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+
+# trained cond: i2v_tail
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image1.5osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# trained cond: i2v_head
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image2osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# trained cond: i2v_tail
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image2osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# trained cond: i2v_head
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image2.5osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 2.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# trained cond: i2v_tail
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image2.5osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 2.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+
+
+# trained cond: i2v_loop
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop --use-sdedit False --cond-type i2v_loop --loop 2
+
+# # trained cond: i2v_loop
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image1osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 1 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# # trained cond: i2v_loop
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image2osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# # trained cond: i2v_loop
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image3osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 3 --use-oscillation-guidance-for-text True --cfg-scale 7.5
+
+# # trained cond: i2v_loop, cfg text osci
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text7osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 7
+
+# # trained cond: i2v_loop, image text osci
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text3.5osci_image3.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 3.5 --use-oscillation-guidance-for-image True --image-cfg-scale 3.5
+
+# # trained cond: i2v_loop, image text osci
+# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text7osci_image3.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 7 --use-oscillation-guidance-for-image True --image-cfg-scale 3.5
+
+
+
+
+
+
+# # base cond: i2v_head
+# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_head --use-sdedit False --cond-type i2v_head
+
+# # base cond: i2v_tail
+# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_tail --use-sdedit False --cond-type i2v_tail
+
+# # base cond: i2v_loop
+# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_loop --use-sdedit False --cond-type i2v_loop
diff --git a/eval/README.md b/eval/README.md
new file mode 100644
index 0000000..1689ffd
--- /dev/null
+++ b/eval/README.md
@@ -0,0 +1,114 @@
+# Evaluation
+
+## Human evaluation
+
+To conduct human evaluation, we need to generate various samples. We provide many prompts in `assets/texts`, and define some test settings covering different resolutions, durations and aspect ratios in `eval/sample.sh`. To facilitate the usage of multiple GPUs, we split sampling tasks into several parts.
+
+```bash
+# image (1)
+bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -1
+# video (2a 2b 2c ...)
+bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -2a
+# launch 8 jobs at once (you must read the script to understand the details)
+bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
+```
+
+## Rectified Flow Loss
+
+Evaluate the rectified flow loss with the following commands.
+
+```bash
+# image
+torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/img.csv --ckpt-path /path/to/ckpt
+
+# video
+torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt
+
+# select resolution
+torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt --resolution 720p
+```
+
+To launch multiple jobs at once, use the following script.
+
+```bash
+bash eval/loss/launch.sh /path/to/ckpt model_name
+```
+
+To obtain an organized list of scores:
+```bash
+python eval/loss/tabulate_rl_loss.py --log_dir path/to/log/dir
+```
+
+## VBench
+
+[VBench](https://github.com/Vchitect/VBench) is a benchmark for short text to video generation. We provide a script for easily generating samples required by VBench.
+
+First, generate the relevant videos with the following commands:
+
+```bash
+# vbench task; to evaluate all prompts, set start_index to 0 and end_index to 2000
+bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index
+
+# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
+bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
+
+# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
+bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
+```
+
+After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
+
+<!-- ```bash
+bash eval/vbench/vbench.sh /path/to/video_folder /path/to/model/ckpt
+``` -->
+
+```bash
+python eval/vbench/calc_vbench.py /path/to/video_folder /path/to/model/ckpt
+```
+
+Finally, we obtain the scaled scores for the model by:
+```bash
+python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/score/dir
+```
+
+## VBench-i2v
+
+[VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image to video generation (beta version).
+Similarly, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies".
+
+```bash
+# Step 1: generate the relevant videos
+# vbench i2v tasks; to evaluate all prompts, set start_index to 0 and end_index to 2000
+bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5 start_index end_index
+# Alternatively, launch 8 jobs at once
+bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name
+
+# Step 2: run vbench to evaluate the generated samples
+python eval/vbench_i2v/vbench_i2v.py /path/to/video_folder /path/to/model/ckpt
+# Note that you need to go to `VBench/vbench2_beta_i2v/utils.py` and change the hard-coded var `image_root` in the `load_i2v_dimension_info` function to your corresponding image folder.
+
+# Step 3: obtain the scaled scores
+python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/to/your/model/ckpt
+# this will store the results under `eval/vbench_i2v` in the path/to/your/model/ckpt
+
+```
+
+Similar to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine:
+
+```bash
+bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 360p 9:16 30 2 True
+# if no flow control, use "None" instead
+```
+
+## VAE
+
+Install the dependency packages following the "Evaluation Dependencies" section of our [installation](../docs/installation.md) guide. Then, run the following evaluation command:
+
+```bash
+# metric can be any one or a list of: ssim, psnr, lpips, flolpips
+python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir path/to/original/videos --generated_video_dir path/to/generated/videos --device cuda --sample_fps 24 --crop_size 360 --resolution 360p --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
+```
diff --git a/eval/human_eval/generate.sh b/eval/human_eval/generate.sh
new file mode 100644
index 0000000..5dd3ab7
--- /dev/null
+++ b/eval/human_eval/generate.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Generate images from a prompt list with 8 background jobs, one per GPU.
+set -x
+set -e
+
+TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt
+OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw
+CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py"
+: "${CKPT:?CKPT must be set in the environment; logs are written under its parent directory}"
+LOG_BASE=$(dirname "$CKPT")/eval/generate
+mkdir -p "${LOG_BASE}"
+NUM_PER_GPU=10000
+N_LAUNCH=2
+NUM_START=$(($N_LAUNCH * $NUM_PER_GPU * 8))
+# Each job takes its own NUM_PER_GPU-sized prompt slice starting at NUM_START and its own image size.
+CUDA_VISIBLE_DEVICES=0 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 0)) --end-index $(($NUM_START + $NUM_PER_GPU * 1)) --image-size 2048 2048 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_1.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=1 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 1)) --end-index $(($NUM_START + $NUM_PER_GPU * 2)) --image-size 1408 2816 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_2.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=2 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 2)) --end-index $(($NUM_START + $NUM_PER_GPU * 3)) --image-size 2816 1408 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_3.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=3 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 3)) --end-index $(($NUM_START + $NUM_PER_GPU * 4)) --image-size 1664 2304 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_4.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=4 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 4)) --end-index $(($NUM_START + $NUM_PER_GPU * 5)) --image-size 2304 1664 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_5.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=5 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 5)) --end-index $(($NUM_START + $NUM_PER_GPU * 6)) --image-size 1536 2560 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_6.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=6 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 6)) --end-index $(($NUM_START + $NUM_PER_GPU * 7)) --image-size 2560 1536 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_7.log" 2>&1 &
+CUDA_VISIBLE_DEVICES=7 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 7)) --end-index $(($NUM_START + $NUM_PER_GPU * 8)) --image-size 2048 2048 --verbose 1 --batch-size 2 >"${LOG_BASE}/${N_LAUNCH}_8.log" 2>&1 &
diff --git a/eval/human_eval/launch.sh b/eval/human_eval/launch.sh
new file mode 100644
index 0000000..bcd2d24
--- /dev/null
+++ b/eval/human_eval/launch.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Usage: bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
+CKPT=$1
+NUM_FRAMES=$2
+MODEL_NAME=$3
+# For a path containing "ema", name the run after the parent directory plus an "_ema" suffix.
+if [[ $CKPT == *"ema"* ]]; then
+    parentdir=$(dirname "$CKPT")
+    CKPT_BASE=$(basename "$parentdir")_ema
+else
+    CKPT_BASE=$(basename "$CKPT")
+fi
+LOG_BASE=$(dirname "$CKPT")/eval
+mkdir -p "${LOG_BASE}"
+echo "Logging to $LOG_BASE"
+
+GPUS=(0 1 2 3 4 5 6 7)
+# TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task
+TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h)
+# FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES)
+# Start one background eval/sample.sh job per GPU; each job logs to <task id>.log under LOG_BASE.
+for i in "${!GPUS[@]}"; do
+    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "$CKPT" $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >"${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
+done
+
+# kill all by: pkill -f "inference"
diff --git a/eval/loss/eval_loss.py b/eval/loss/eval_loss.py
new file mode 100644
index 0000000..0f0746d
--- /dev/null
+++ b/eval/loss/eval_loss.py
@@ -0,0 +1,183 @@
+from pprint import pformat
+
+import colossalai
+import torch
+import torch.distributed as dist
+from colossalai.cluster import DistCoordinator
+from mmengine.runner import set_random_seed
+from tqdm import tqdm
+
+from opensora.acceleration.parallel_states import get_data_parallel_group, set_data_parallel_group
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
+from opensora.utils.config_utils import parse_configs
+from opensora.utils.misc import create_logger, to_torch_dtype
+from opensora.utils.train_utils import MaskGenerator
+
+
+def main():
+    torch.set_grad_enabled(False)
+    # ======================================================
+    # configs & runtime variables
+    # ======================================================
+    # == parse configs ==
+    cfg = parse_configs(training=False)
+
+    # == device and dtype ==
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    cfg_dtype = cfg.get("dtype", "fp32")
+    assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
+    dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # == init distributed env ==
+    colossalai.launch_from_torch({})
+    DistCoordinator()
+    set_random_seed(seed=cfg.get("seed", 1024))
+    set_data_parallel_group(dist.group.WORLD)
+
+    # == init logger ==
+    logger = create_logger()
+    logger.info("Eval loss configuration:\n %s", pformat(cfg.to_dict()))
+
+    # ======================================================
+    # build model & load weights
+    # ======================================================
+    logger.info("Building models...")
+    # == build text-encoder and vae ==
+    text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
+    if text_encoder is not None:
+        text_encoder_output_dim = text_encoder.output_dim
+        text_encoder_model_max_length = text_encoder.model_max_length
+        cfg.dataset.tokenize_fn = text_encoder.tokenize_fn
+    else:
+        text_encoder_output_dim = cfg.get("text_encoder_output_dim", 4096)
+        text_encoder_model_max_length = cfg.get("text_encoder_model_max_length", 300)
+
+    vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()
+
+    # == build diffusion model ==
+    input_size = (None, None, None)
+    latent_size = vae.get_latent_size(input_size)
+    model = (
+        build_module(
+            cfg.model,
+            MODELS,
+            input_size=latent_size,
+            in_channels=vae.out_channels,
+            caption_channels=text_encoder_output_dim,
+            model_max_length=text_encoder_model_max_length,
+            enable_sequence_parallelism=cfg.get("sp_size", 1) > 1,
+        )
+        .to(device, dtype)
+        .eval()
+    )
+    text_encoder.y_embedder = model.y_embedder  # HACK: for classifier-free guidance
+
+    # == build scheduler ==
+    scheduler = build_module(cfg.scheduler, SCHEDULERS)
+
+    if cfg.get("mask_ratios", None) is not None:
+        mask_generator = MaskGenerator(cfg.mask_ratios)
+
+    # ======================================================
+    # inference
+    # ======================================================
+    # start evaluation, prepare a dataset everytime in the loop
+    bucket_config = cfg.bucket_config
+    if cfg.get("resolution", None) is not None:
+        bucket_config = {cfg.resolution: bucket_config[cfg.resolution]}
+    assert bucket_config is not None, "bucket_config is required for evaluation"
+    logger.info("Evaluating bucket_config: %s", bucket_config)
+
+    def build_dataset(resolution, num_frames, batch_size):
+        bucket_config = {resolution: {num_frames: (1.0, batch_size)}}
+        dataset = build_module(cfg.dataset, DATASETS)
+        dataloader_args = dict(
+            dataset=dataset,
+            batch_size=None,
+            num_workers=cfg.num_workers,
+            shuffle=False,
+            drop_last=False,
+            pin_memory=True,
+            process_group=get_data_parallel_group(),
+        )
+        dataloader, sampler = prepare_dataloader(bucket_config=bucket_config, **dataloader_args)
+        num_batch = sampler.get_num_batch()
+        num_steps_per_epoch = num_batch // dist.get_world_size()
+        return dataloader, num_steps_per_epoch, num_batch
+
+    evaluation_losses = {}
+    start = cfg.start_index if "start_index" in cfg else 0
+    end = cfg.end_index if "end_index" in cfg else len(bucket_config)
+    for i, res in enumerate(bucket_config):
+        if len(bucket_config) > 1 and (i < start or i >= end):  # skip task
+            print("skipping:", bucket_config[res])
+            continue
+
+        t_bucket = bucket_config[res]
+        num_frames_index = 0
+        for num_frames, (_, batch_size) in t_bucket.items():
+            if batch_size is None:
+                continue
+
+            if len(bucket_config) == 1 and (num_frames_index < start or num_frames_index >= end):  # skip task
+                print("skipping:", num_frames)
+                num_frames_index += 1
+                continue
+            else:
+                num_frames_index += 1
+            logger.info("Evaluating resolution: %s, num_frames: %s", res, num_frames)
+            dataloader, num_steps_per_epoch, num_batch = build_dataset(res, num_frames, batch_size)
+            if num_batch == 0:
+                logger.warning("No data for resolution: %s, num_frames: %s", res, num_frames)
+                continue
+
+            evaluation_t_losses = []
+            for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10) + 2)[1:-1]:
+                loss_t = 0.0
+                num_samples = 0
+                dataloader_iter = iter(dataloader)
+                for _ in tqdm(range(num_steps_per_epoch), desc=f"res: {res}, num_frames: {num_frames}, t: {t:.2f}"):
+                    batch = next(dataloader_iter)
+                    x = batch.pop("video").to(device, dtype)
+                    batch.pop("text")
+                    x = vae.encode(x)
+                    input_ids = batch.pop("input_ids")
+                    attention_mask = batch.pop("attention_mask")
+                    model_args = text_encoder.encode(input_ids, attention_mask=attention_mask)
+
+                    # == mask ==
+                    mask = None
+                    if cfg.get("mask_ratios", None) is not None:
+                        mask = mask_generator.get_masks(x)
+                        model_args["x_mask"] = mask
+
+                    # == video meta info ==
+                    for k, v in batch.items():
+                        model_args[k] = v.to(device, dtype)
+
+                    # == diffusion loss computation ==
+                    timestep = torch.tensor([t] * x.shape[0], device=device, dtype=dtype)
+                    loss_dict = scheduler.training_losses(model, x, model_args, mask=mask, t=timestep)
+                    losses = loss_dict["loss"]  # (batch_size)
+                    num_samples += x.shape[0]
+                    loss_t += losses.sum().item()
+                loss_t /= num_samples
+                evaluation_t_losses.append(loss_t)
+                logger.info("resolution: %s, num_frames: %s, timestep: %.2f, loss: %.4f", res, num_frames, t, loss_t)
+
+            evaluation_losses[(res, num_frames)] = sum(evaluation_t_losses) / len(evaluation_t_losses)
+            logger.info(
+                "Evaluation losses for resolution: %s, num_frames: %s, loss: %s\n %s",
+                res,
+                num_frames,
+                evaluation_losses[(res, num_frames)],
+                evaluation_t_losses,
+            )
+    logger.info("Evaluation losses: %s", evaluation_losses)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/loss/launch.sh b/eval/loss/launch.sh
new file mode 100644
index 0000000..c70c52d
--- /dev/null
+++ b/eval/loss/launch.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
+CKPT_PATH=$1
+MODEL_NAME=$2
+IMG_PATH=$3
+VID_PATH=$4
+
+if [ -z $IMG_PATH ]; then
+    IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
+fi
+
+if [ -z $VID_PATH ]; then
+    VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
+fi
+
+if [[ $CKPT_PATH == *"ema"* ]]; then
+    parentdir=$(dirname $CKPT_PATH)
+    CKPT_BASE=$(basename $parentdir)_ema
+else
+    CKPT_BASE=$(basename $CKPT_PATH)
+fi
+LOG_BASE=$(dirname $CKPT_PATH)/eval
+mkdir -p $LOG_BASE
+echo "Logging to $LOG_BASE"
+
+
+GPUS=(3 4 5 6 7)
+RESOLUTION=(144p 240p 360p 480p 720p)
+
+CUDA_VISIBLE_DEVICES=0 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 0 --end-index 5 >${LOG_BASE}/img_0.log 2>&1 &
+CUDA_VISIBLE_DEVICES=1 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 5 --end-index 6 >${LOG_BASE}/img_1.log 2>&1 &
+CUDA_VISIBLE_DEVICES=2 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 6 >${LOG_BASE}/img_2.log 2>&1 &
+
+
+for i in "${!GPUS[@]}"; do
+    CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path $VID_PATH --ckpt-path $CKPT_PATH --resolution ${RESOLUTION[i]} >${LOG_BASE}/${RESOLUTION[i]}_vid.log 2>&1 &
+done
diff --git a/eval/loss/launch_single.sh b/eval/loss/launch_single.sh
new file mode 100644
index 0000000..dfabe01
--- /dev/null
+++ b/eval/loss/launch_single.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Usage: bash eval/loss/launch_single.sh MASTER_PORT CKPT_PATH VID_PATH
+CMD_FILE_CONFIG="eval/loss/eval_loss.py configs/opensora-pro/misc/eval_loss.py"
+PORTS=$1
+CKPT_PATH=$2
+VID_PATH=$3
+
+# only evaluate for 360p, 102f
+RESOLUTION=360p
+torchrun --master-port "${PORTS}" --nproc_per_node 1 $CMD_FILE_CONFIG --data-path "$VID_PATH" --ckpt-path "$CKPT_PATH" --resolution ${RESOLUTION}
diff --git a/eval/loss/tabulate_rl_loss.py b/eval/loss/tabulate_rl_loss.py
new file mode 100644
index 0000000..4c623c0
--- /dev/null
+++ b/eval/loss/tabulate_rl_loss.py
@@ -0,0 +1,55 @@
+"""
+usage:
+    python tabulate_rl_loss.py --log_dir /home/zhengzangwei/projs/Open-Sora-dev/logs/loss
+
+save the processed json to:
+    <log_dir>/loss.json
+"""
+
+import argparse
+import json
+import os
+from ast import literal_eval
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--log_dir", type=str)
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    files = os.listdir(args.log_dir)
+    files = [
+        "img_0.log",
+        "img_1.log",
+        "img_2.log",
+        "144p_vid.log",
+        "240p_vid.log",
+        "360p_vid.log",
+        "480p_vid.log",
+        "720p_vid.log",
+    ]
+
+    loss_info = {}
+
+    for fname in files:
+        path = os.path.join(args.log_dir, fname)
+        with open(path, "r", encoding="utf-8") as f:
+            content = f.readlines()
+        eval_line = content[-1].split("losses:")[-1].strip()
+        loss_dict = literal_eval(eval_line)
+        for key, loss in loss_dict.items():
+            resolution, frame = key
+            if resolution not in loss_info:
+                loss_info[resolution] = {}
+            loss_info[resolution][frame] = format(loss, ".4f")
+
+    # Convert and write JSON object to file
+    output_file_path = os.path.join(args.log_dir, "loss.json")
+    with open(output_file_path, "w") as outfile:
+        json.dump(loss_info, outfile, indent=4, sort_keys=True)
+    print(f"results saved to: {output_file_path}")
diff --git a/eval/sample.sh b/eval/sample.sh
new file mode 100644
index 0000000..0533d30
--- /dev/null
+++ b/eval/sample.sh
@@ -0,0 +1,311 @@
+# !/bin/bash
+
+CKPT=$1
+NUM_FRAMES=$2
+MODEL_NAME=$3
+TASK_TYPE=$4
+VBENCH_START_INDEX=$5
+VBENCH_END_INDEX=$6
+VBENCH_RES=$7
+VBENCH_ASP_RATIO=$8
+
+NUM_SAMPLING_STEPS=$9
+FLOW=${10}
+LLM_REFINE=${11}
+
+BASE_ASPECT_RATIO=360p
+ASPECT_RATIOS=(360p 720p)
+# Find BASE_ASPECT_RATIO in ASPECT_RATIOS and record the entries one and two positions above it (values are resolution names)
+i=0
+for r in "${ASPECT_RATIOS[@]}"; do
+  if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
+    # entry one level above the base
+    if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
+      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
+    else
+      # no entry one level up: fall back to the last entry of the list
+      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
+    fi
+    # entry two levels above the base
+    if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
+      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
+    else
+      # no entry two levels up: fall back to the last entry of the list
+      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
+    fi
+  fi
+  i=$((i+1))
+done
+echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
+echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
+echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
+echo "Note that this aspect ratio level setting is used for videos only, not images"
+
+echo "NUM_FRAMES=${NUM_FRAMES}"
+# NOTE: a missing NUM_FRAMES only prints a message below; the script keeps running.
+if [ -z "${NUM_FRAMES}" ]; then
+  echo "you need to pass NUM_FRAMES"
+else
+  let DOUBLE_FRAMES=$2*2 # $2 is the same positional argument as NUM_FRAMES
+  let QUAD_FRAMES=$2*4
+  let OCT_FRAMES=$2*8
+fi
+
+echo "DOUBLE_FRAMES=${DOUBLE_FRAMES}"
+echo "QUAD_FRAMES=${QUAD_FRAMES}"
+echo "OCT_FRAMES=${OCT_FRAMES}"
+
+# CMD="python scripts/inference.py configs/opensora-v1-2/inference/sample.py"
+CMD="python scripts/inference.py configs/opensora-v1-3/inference/t2v.py"
+CMD_I2V="python scripts/inference_i2v.py configs/opensora-v1-3/inference/v2v.py"
+# Checkpoint label: an "ema" path is named after its parent directory plus "_ema".
+if [[ $CKPT == *"ema"* ]]; then
+  parentdir=$(dirname $CKPT)
+  CKPT_BASE=$(basename $parentdir)_ema
+else
+  CKPT_BASE=$(basename $CKPT)
+fi
+OUTPUT="/mnt/jfs-hdd/sora/samples/samples_${MODEL_NAME}_${CKPT_BASE}"
+start=$(date +%s) # start time for the runtime report printed at the end of the script
+DEFAULT_BS=1 # --batch-size used by the prompt-file sampling commands
+
+### Functions
+
+# called inside run_video_b
+# Image (single-frame) samples: four prompt files at 360p 1:1 (with --end-index 3), then one
+# fixed prompt at 720p in seven aspect ratios.
+function run_image() {
+  local pair prompt_file tag ratio
+
+  # 360p multi-sample; each entry is "<prompt file stem>:<tag used in the sample name>"
+  for pair in \
+    t2v_sora:sora \
+    t2v_short:short \
+    t2v_samples:t2v \
+    t2i_samples:t2i; do
+    prompt_file=${pair%%:*}
+    tag=${pair##*:}
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/${prompt_file}.txt --save-dir $OUTPUT --num-frames 1 --resolution 360p --aspect-ratio 1:1 --sample-name image_${tag}_360p_1_1 --end-index 3 --batch-size $DEFAULT_BS
+  done
+
+  # 720p multi-resolution
+  # PROMPT is a global variable (it is not declared local)
+  PROMPT="Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens."
+  for ratio in 1:1 9:16 16:9 4:3 3:4 1:2 2:1; do
+    # the sample name spells the ratio with "_" instead of ":" (e.g. image_720p_9_16)
+    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio $ratio --sample-name image_720p_${ratio/:/_}
+  done
+}
+
+function run_video_a() {
+  local res
+  # sample, 720p then 360p, 9:16
+  for res in 720p 360p; do
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 97 --resolution $res --aspect-ratio 9:16 --sample-name sample_97_$res --batch-size $DEFAULT_BS
+  done
+
+  # sample random type, 720p, 9:16
+  # this run passes --llm-refine True and is guarded by OPENAI_API_KEY: without the key the whole script exits with status 1
+  if [[ -n "${OPENAI_API_KEY}" ]]; then
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/rand_types.txt --save-dir $OUTPUT --num-frames 97 --resolution 720p --aspect-ratio 9:16 --sample-name rand_types_2s_720p --batch-size $DEFAULT_BS --llm-refine True
+  else
+    echo "Error: Required environment variable 'OPENAI_API_KEY' is not set."
+    exit 1
+  fi
+}
+
+function run_video_b() {
+  # Runs the image samples first, then the t2v_short.txt prompts as 97-frame 9:16 videos.
+  local res
+  echo "Inside run_video_b, running image samples..."
+  run_image
+
+  echo "Inside run_video_b, running video samples..."
+  # short, 720p then 360p, 9:16
+  for res in 720p 360p; do
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 97 --resolution $res --aspect-ratio 9:16 --sample-name short_97_$res --batch-size $DEFAULT_BS
+  done
+}
+
+function run_video_c() {
+  # Two groups: an aspect-ratio sweep at 720p, then motion-score and aesthetic-score prompt sweeps.
+  local ratio
+
+  # 720p, multi-resolution: the same prompt as a 49-frame video in each aspect ratio
+  PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
+  for ratio in 1:1 16:9 9:16 4:3 3:4 1:2 2:1; do
+    # the sample name spells the ratio with "_" instead of ":" (e.g. drone_cliff_prompt_720p_49_16_9)
+    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio $ratio --sample-name drone_cliff_prompt_720p_49_${ratio/:/_}
+  done
+
+  # add motion score
+  # one base prompt, then the same prompt with a "motion score: <value>" suffix;
+  # the resolution is ASPECT_RATIO_INCR_2, the entry two levels above BASE_ASPECT_RATIO
+  # (--num-frames is passed as "2s" for both sweeps of this function)
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
+    \"A stylish woman walking in the street of Tokyo.\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 6.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 10.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 25.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 50.0\" \
+    \"A stylish woman walking in the street of Tokyo. motion score: 100.0\"
+
+  # add aes score
+  # same layout as the motion sweep, with an "aesthetic score: <value>" suffix
+  # and the same resolution (ASPECT_RATIO_INCR_2)
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
+    \"A stylish woman walking in the street of Tokyo.\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.5\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.0\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.5\" \
+    \"A stylish woman walking in the street of Tokyo. aesthetic score: 7.0\"
+}
+
+# vbench has 950 samples
+# Batch size and fallback frame size (used when VBENCH_RES / VBENCH_ASP_RATIO are not given):
+VBENCH_BS=1
+VBENCH_H=360
+VBENCH_W=640
+# run_vbench START END: sample assets/texts/VBench/all_dimension.txt into ${OUTPUT}_vbench ($1/$2 -> --start-index/--end-index).
+function run_vbench() {
+  if [ -z "${VBENCH_RES}" ] || [ -z "${VBENCH_ASP_RATIO}" ]; then # no resolution/aspect ratio: fixed image size
+    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+      --prompt-path assets/texts/VBench/all_dimension.txt \
+      --image-size $VBENCH_H $VBENCH_W \
+      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+  else
+    if [ -z "${NUM_SAMPLING_STEPS}" ]; then # resolution + aspect ratio only
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z "${FLOW}" ]; then # same command as above: NUM_SAMPLING_STEPS itself is not forwarded
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z "${LLM_REFINE}" ]; then # FLOW given, no LLM_REFINE: add --flow
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_dimension.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          if [ "${FLOW}" = "None" ]; then # FLOW is the literal "None": add --llm-refine only
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else # add both --flow and --llm-refine
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
+        fi
+      fi
+    fi
+  fi
+}
+
+# vbench-i2v has 1120 samples
+# Fallback frame size, used when VBENCH_RES / VBENCH_ASP_RATIO are not given:
+VBENCH_I2V_H=360
+VBENCH_I2V_W=360
+# run_vbench_i2v START END: sample assets/texts/VBench/all_i2v.txt into ${OUTPUT}_vbench_i2v. NOTE(review): runs $CMD, not CMD_I2V - confirm.
+function run_vbench_i2v() {
+  if [ -z ${VBENCH_RES} ] || [ -z ${VBENCH_ASP_RATIO} ]; then # no resolution/aspect ratio: fixed image size
+    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+      --prompt-path assets/texts/VBench/all_i2v.txt \
+      --image-size $VBENCH_I2V_H $VBENCH_I2V_W \
+      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+  else
+    if [ -z ${NUM_SAMPLING_STEPS} ]; then # resolution + aspect ratio only
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z ${FLOW} ]; then # same command as above: NUM_SAMPLING_STEPS itself is not forwarded
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z ${LLM_REFINE} ]; then # FLOW given, no LLM_REFINE: add --flow
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_i2v.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          if [ "${FLOW}" = "None" ]; then # FLOW is the literal "None": add --llm-refine only
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else # add both --flow and --llm-refine
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
+        fi
+      fi
+    fi
+  fi
+}
+
+### Main
+
+for arg in "$@"; do
+  # image
+  if [[ "$arg" = -1 ]] || [[ "$arg" = --image ]]; then
+    echo "Running image samples..."
+    run_image
+  fi
+  if [[ "$arg" = -2a ]] || [[ "$arg" = --video ]]; then
+    echo "Running video samples a..."
+    run_video_a
+  fi
+  if [[ "$arg" = -2b ]] || [[ "$arg" = --video ]]; then
+    echo "Running video samples b..."
+    run_video_b
+  fi
+  if [[ "$arg" = -2c ]] || [[ "$arg" = --video ]]; then
+    echo "Running video samples c..."
+    run_video_c
+  fi
+  # vbench
+  if [[ "$arg" = -4 ]] || [[ "$arg" = --vbench ]]; then
+    echo "Running vbench samples ..."
+    if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
+      echo "need to set start_index and end_index"
+    else
+      run_vbench $VBENCH_START_INDEX $VBENCH_END_INDEX
+    fi
+  fi
+  # vbench-i2v
+  if [[ "$arg" = -5 ]] || [[ "$arg" = --vbench-i2v ]]; then
+    echo "Running vbench-i2v samples ..."
+    if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
+      echo "need to set start_index and end_index"
+    else
+      run_vbench_i2v $VBENCH_START_INDEX $VBENCH_END_INDEX
+    fi
+  fi
+done
+
+### End
+
+end=$(date +%s)
+
+runtime=$((end - start))
+
+echo "Runtime: $runtime seconds"
diff --git a/eval/vae/cal_flolpips.py b/eval/vae/cal_flolpips.py
new file mode 100644
index 0000000..a8824da
--- /dev/null
+++ b/eval/vae/cal_flolpips.py
@@ -0,0 +1,89 @@
+import sys
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+sys.path.append(".")
+
+from flolpips.flolpips import FloLPIPS
+from flolpips.pwcnet import Network as PWCNet
+
+loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
+flownet = PWCNet().eval().requires_grad_(False)
+
+
+def trans(x):  # identity hook: the videos are passed to FloLPIPS exactly as given
+    return x
+
+
+def calculate_flolpips(videos1, videos2, device):
+    global loss_fn, flownet
+
+    print("calculate_flowlpips...")
+    loss_fn = loss_fn.to(device)
+    flownet = flownet.to(device)
+
+    if videos1.shape != videos2.shape:
+        print("Warning: the shape of videos are not equal.")
+        min_frames = min(videos1.shape[1], videos2.shape[1])
+        videos1 = videos1[:, :min_frames]
+        videos2 = videos2[:, :min_frames]
+
+    videos1 = trans(videos1)
+    videos2 = trans(videos2)
+
+    flolpips_results = []
+    for video_num in tqdm(range(videos1.shape[0])):
+        video1 = videos1[video_num].to(device)
+        video2 = videos2[video_num].to(device)
+        frames_rec = video1[:-1]
+        frames_rec_next = video1[1:]
+        frames_gt = video2[:-1]
+        frames_gt_next = video2[1:]
+        t, c, h, w = frames_gt.shape
+        flow_gt = flownet(frames_gt, frames_gt_next)
+        flow_dis = flownet(frames_rec, frames_rec_next)
+        flow_diff = flow_gt - flow_dis
+        flolpips = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True)
+        flolpips_results.append(flolpips.cpu().numpy().tolist())
+
+    flolpips_results = np.array(flolpips_results)  # [batch_size, num_frames]
+    flolpips = {}
+    flolpips_std = {}
+
+    for clip_timestamp in range(flolpips_results.shape[1]):
+        flolpips[clip_timestamp] = np.mean(flolpips_results[:, clip_timestamp], axis=-1)
+        flolpips_std[clip_timestamp] = np.std(flolpips_results[:, clip_timestamp], axis=-1)
+
+    result = {
+        "value": flolpips,
+        "value_std": flolpips_std,
+        "video_setting": video1.shape,
+        "video_setting_name": "time, channel, heigth, width",
+        "result": flolpips_results,
+        "details": flolpips_results.tolist(),
+    }
+
+    return result
+
+
+# test code / using example
+
+
+def main():
+    NUMBER_OF_VIDEOS = 8
+    VIDEO_LENGTH = 50
+    CHANNEL = 3
+    SIZE = 64
+    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+
+    import json
+
+    result = calculate_flolpips(videos1, videos2, "cuda:0")
+    print(json.dumps(result, indent=4))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/vae/cal_lpips.py b/eval/vae/cal_lpips.py
new file mode 100644
index 0000000..8f0b7a6
--- /dev/null
+++ b/eval/vae/cal_lpips.py
@@ -0,0 +1,99 @@
+import lpips
+import numpy as np
+import torch
+from tqdm import tqdm
+
+spatial = True  # Return a spatial map of perceptual distance.
+
+# Linearly calibrated models (LPIPS)
+loss_fn = lpips.LPIPS(net="alex", spatial=spatial)  # Can also set net = 'squeeze' or 'vgg'
+# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'
+
+
+def trans(x):
+    # if greyscale images add channel
+    if x.shape[-3] == 1:
+        x = x.repeat(1, 1, 3, 1, 1)
+
+    # value range [0, 1] -> [-1, 1]
+    x = x * 2 - 1
+
+    return x
+
+
+def calculate_lpips(videos1, videos2, device):
+    # image should be RGB, IMPORTANT: normalized to [-1,1]
+    print("calculate_lpips...")
+
+    assert videos1.shape == videos2.shape
+
+    # videos [batch_size, timestamps, channel, h, w]
+
+    # support grayscale input, if grayscale -> channel*3
+    # value range [0, 1] -> [-1, 1]
+    videos1 = trans(videos1)
+    videos2 = trans(videos2)
+
+    lpips_results = []
+
+    for video_num in tqdm(range(videos1.shape[0])):
+        # get a video
+        # video [timestamps, channel, h, w]
+        video1 = videos1[video_num]
+        video2 = videos2[video_num]
+
+        lpips_results_of_a_video = []
+        for clip_timestamp in range(len(video1)):
+            # get a img
+            # img [timestamps[x], channel, h, w]
+            # img [channel, h, w] tensor
+
+            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
+            img2 = video2[clip_timestamp].unsqueeze(0).to(device)
+
+            loss_fn.to(device)
+
+            # calculate lpips of a video
+            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
+        lpips_results.append(lpips_results_of_a_video)
+
+    lpips_results = np.array(lpips_results)
+
+    lpips = {}
+    lpips_std = {}
+
+    for clip_timestamp in range(len(video1)):
+        lpips[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
+        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])
+
+    result = {
+        "value": lpips,
+        "value_std": lpips_std,
+        "video_setting": video1.shape,
+        "video_setting_name": "time, channel, heigth, width",
+    }
+
+    return result
+
+
+# test code / using example
+
+
+def main():
+    NUMBER_OF_VIDEOS = 8
+    VIDEO_LENGTH = 50
+    CHANNEL = 3
+    SIZE = 64
+    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    device = torch.device("cuda")
+    # device = torch.device("cpu")
+
+    import json
+
+    result = calculate_lpips(videos1, videos2, device)
+    print(json.dumps(result, indent=4))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/vae/cal_psnr.py b/eval/vae/cal_psnr.py
new file mode 100644
index 0000000..13c30e5
--- /dev/null
+++ b/eval/vae/cal_psnr.py
@@ -0,0 +1,92 @@
+import math
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+
+def img_psnr(img1, img2):
+    # [0,1]
+    # compute mse
+    # mse = np.mean((img1-img2)**2)
+    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
+    # compute psnr
+    if mse < 1e-10:
+        return 100
+    psnr = 20 * math.log10(1 / math.sqrt(mse))
+    return psnr
+
+
+def trans(x):
+    return x  # identity hook: the input is handed back untouched
+
+
+def calculate_psnr(videos1, videos2):
+    print("calculate_psnr...")
+
+    # videos [batch_size, timestamps, channel, h, w]
+
+    assert videos1.shape == videos2.shape
+
+    videos1 = trans(videos1)
+    videos2 = trans(videos2)
+
+    psnr_results = []
+
+    for video_num in tqdm(range(videos1.shape[0])):
+        # get a video
+        # video [timestamps, channel, h, w]
+        video1 = videos1[video_num]
+        video2 = videos2[video_num]
+
+        psnr_results_of_a_video = []
+        for clip_timestamp in range(len(video1)):
+            # get a img
+            # img [timestamps[x], channel, h, w]
+            # img [channel, h, w] numpy
+
+            img1 = video1[clip_timestamp].numpy()
+            img2 = video2[clip_timestamp].numpy()
+
+            # calculate psnr of a video
+            psnr_results_of_a_video.append(img_psnr(img1, img2))
+
+        psnr_results.append(psnr_results_of_a_video)
+
+    psnr_results = np.array(psnr_results)  # [batch_size, num_frames]
+    psnr = {}
+    psnr_std = {}
+
+    for clip_timestamp in range(len(video1)):
+        psnr[clip_timestamp] = np.mean(psnr_results[:, clip_timestamp])
+        psnr_std[clip_timestamp] = np.std(psnr_results[:, clip_timestamp])
+
+    result = {
+        "value": psnr,
+        "value_std": psnr_std,
+        "video_setting": video1.shape,
+        "video_setting_name": "time, channel, heigth, width",
+    }
+
+    return result
+
+
+# test code / using example
+
+
+def main():
+    NUMBER_OF_VIDEOS = 8
+    VIDEO_LENGTH = 50
+    CHANNEL = 3
+    SIZE = 64
+    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+
+    import json
+
+    result = calculate_psnr(videos1, videos2)
+    print(json.dumps(result, indent=4))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/vae/cal_ssim.py b/eval/vae/cal_ssim.py
new file mode 100644
index 0000000..65338c8
--- /dev/null
+++ b/eval/vae/cal_ssim.py
@@ -0,0 +1,119 @@
+import cv2
+import numpy as np
+import torch
+from tqdm import tqdm
+
+
+def ssim(img1, img2):
+    C1 = 0.01**2
+    C2 = 0.03**2
+    img1 = img1.astype(np.float64)
+    img2 = img2.astype(np.float64)
+    kernel = cv2.getGaussianKernel(11, 1.5)
+    window = np.outer(kernel, kernel.transpose())
+    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
+    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
+    mu1_sq = mu1**2
+    mu2_sq = mu2**2
+    mu1_mu2 = mu1 * mu2
+    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
+    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
+    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
+    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
+    return ssim_map.mean()
+
+
+def calculate_ssim_function(img1, img2):
+    # [0,1]
+    # ssim is the only metric extremely sensitive to gray being compared to b/w
+    if not img1.shape == img2.shape:
+        raise ValueError("Input images must have the same dimensions.")
+    if img1.ndim == 2:
+        return ssim(img1, img2)
+    elif img1.ndim == 3:
+        if img1.shape[0] == 3:
+            ssims = []
+            for i in range(3):
+                ssims.append(ssim(img1[i], img2[i]))
+            return np.array(ssims).mean()
+        elif img1.shape[0] == 1:
+            return ssim(np.squeeze(img1), np.squeeze(img2))
+    else:
+        raise ValueError("Wrong input image dimensions.")
+
+
+def trans(x):
+    return x  # identity hook: the input is handed back untouched
+
+
+def calculate_ssim(videos1, videos2):
+    print("calculate_ssim...")
+
+    # videos [batch_size, timestamps, channel, h, w]
+
+    assert videos1.shape == videos2.shape
+
+    videos1 = trans(videos1)
+    videos2 = trans(videos2)
+
+    ssim_results = []
+
+    for video_num in tqdm(range(videos1.shape[0])):
+        # get a video
+        # video [timestamps, channel, h, w]
+        video1 = videos1[video_num]
+        video2 = videos2[video_num]
+
+        ssim_results_of_a_video = []
+        for clip_timestamp in range(len(video1)):
+            # get a img
+            # img [timestamps[x], channel, h, w]
+            # img [channel, h, w] numpy
+
+            img1 = video1[clip_timestamp].numpy()
+            img2 = video2[clip_timestamp].numpy()
+
+            # calculate ssim of a video
+            ssim_results_of_a_video.append(calculate_ssim_function(img1, img2))
+
+        ssim_results.append(ssim_results_of_a_video)
+
+    ssim_results = np.array(ssim_results)
+
+    ssim = {}
+    ssim_std = {}
+
+    for clip_timestamp in range(len(video1)):
+        ssim[clip_timestamp] = np.mean(ssim_results[:, clip_timestamp])
+        ssim_std[clip_timestamp] = np.std(ssim_results[:, clip_timestamp])
+
+    result = {
+        "value": ssim,
+        "value_std": ssim_std,
+        "video_setting": video1.shape,
+        "video_setting_name": "time, channel, heigth, width",
+    }
+
+    return result
+
+
+# test code / using example
+
+
+def main():
+    NUMBER_OF_VIDEOS = 8
+    VIDEO_LENGTH = 50
+    CHANNEL = 3
+    SIZE = 64
+    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
+    torch.device("cuda")
+
+    import json
+
+    result = calculate_ssim(videos1, videos2)
+    print(json.dumps(result, indent=4))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/vae/eval_common_metric.py b/eval/vae/eval_common_metric.py
new file mode 100644
index 0000000..f76bc9b
--- /dev/null
+++ b/eval/vae/eval_common_metric.py
@@ -0,0 +1,265 @@
+"""Calculates the CLIP Scores
+
+The CLIP model is a contrastively learned language-image model. There is
+an image encoder and a text encoder. It is believed that the CLIP model could
+measure the similarity of cross modalities. Please find more information from
+https://github.com/openai/CLIP.
+
+The CLIP Score measures the Cosine Similarity between two embedded features.
+This repository utilizes the pretrained CLIP Model to calculate
+the mean average of cosine similarities.
+
+See --help to see further details.
+
+Code adapted from https://github.com/mseitzer/pytorch-fid and https://github.com/openai/CLIP.
+
+Copyright 2023 The Hong Kong Polytechnic University
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import json
+import os
+import os.path as osp
+import sys
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+
+import numpy as np
+import torch
+import torchvision.transforms as transforms
+from decord import VideoReader, cpu
+from pytorchvideo.transforms import ShortSideScale
+from torch.utils.data import DataLoader, Dataset, Subset
+from torchvision.datasets.folder import pil_loader
+from torchvision.transforms import Compose, Lambda
+from torchvision.transforms._transforms_video import CenterCropVideo
+
+sys.path.append(".")
+from cal_flolpips import calculate_flolpips
+from cal_lpips import calculate_lpips
+from cal_psnr import calculate_psnr
+from cal_ssim import calculate_ssim
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    # If tqdm is not available, provide a mock version of it
+    def tqdm(x):
+        return x
+
+
+class VideoDataset(Dataset):
+    def __init__(
+        self,
+        type,  # image or video
+        real_video_dir,
+        generated_video_dir,
+        num_frames,
+        sample_rate=1,
+        crop_size=None,
+        resolution=128,
+    ) -> None:
+        super().__init__()
+        self.type = type
+        self.real_video_files = self._combine_without_prefix(real_video_dir)
+        self.generated_video_files = self._combine_without_prefix(generated_video_dir)
+        self.num_frames = num_frames
+        self.sample_rate = sample_rate
+        self.crop_size = crop_size
+        self.short_size = resolution
+
+    def __len__(self):
+        return len(self.real_video_files)
+
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError
+
+        real_video_file = self.real_video_files[index]
+        generated_video_file = self.generated_video_files[index]
+        print(real_video_file, generated_video_file)
+
+        if self.type == "video":
+            real_video_tensor = self._load_video(real_video_file)
+            generated_video_tensor = self._load_video(generated_video_file)
+        else:
+            real_video_tensor = self._load_image(real_video_file)
+            generated_video_tensor = self._load_image(generated_video_file)
+
+        return {"real": real_video_tensor, "generated": generated_video_tensor}
+
+    def _load_image(self, image_path):
+        image = pil_loader(image_path)
+        transform = transforms.Compose([transforms.ToTensor()])
+        image = transform(image)
+        video = image.unsqueeze(0)
+        video = video.permute(1, 0, 2, 3)  # TCHW -> CTHW
+        return _preprocess(video, short_size=self.short_size, crop_size=self.crop_size)
+
+    def _load_video(self, video_path):
+        num_frames = self.num_frames
+        sample_rate = self.sample_rate
+        decord_vr = VideoReader(video_path, ctx=cpu(0))
+        total_frames = len(decord_vr)
+        sample_frames_len = sample_rate * num_frames
+
+        if total_frames >= sample_frames_len:
+            s = 0
+            e = s + sample_frames_len
+            num_frames = num_frames
+        else:
+            s = 0
+            e = total_frames
+            num_frames = int(total_frames / sample_frames_len * num_frames)
+            print(
+                f"sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}",
+                video_path,
+                total_frames,
+            )
+
+        frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
+        video_data = decord_vr.get_batch(frame_id_list).asnumpy()
+        video_data = torch.from_numpy(video_data)
+        video_data = video_data.permute(0, 3, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
+        return _preprocess(video_data, short_size=self.short_size, crop_size=self.crop_size)
+
+    def _combine_without_prefix(self, folder_path, prefix="."):
+        folder = []
+        os.makedirs(folder_path, exist_ok=True)
+        for name in os.listdir(folder_path):
+            if name[0] == prefix:
+                continue
+            if osp.isfile(osp.join(folder_path, name)):
+                folder.append(osp.join(folder_path, name))
+        folder.sort()
+        return folder
+
+
+def _preprocess(video_data, short_size=128, crop_size=None):
+    transform = Compose(
+        [
+            Lambda(lambda x: x / 255.0),
+            ShortSideScale(size=short_size),
+            CenterCropVideo(crop_size=crop_size),
+        ]
+    )
+    video_outputs = transform(video_data)
+    # video_outputs = torch.unsqueeze(video_outputs, 0) # (bz,c,t,h,w)
+    return video_outputs
+
+
+def calculate_common_metric(args, dataloader, device):
+    metric_dict = {}
+    if type(args.metric) is str:
+        args.metric = [m.strip() for m in args.metric.split(",")]
+    print(args.metric)
+    for metric in args.metric:
+        score_list = []
+        for batch_data in tqdm(dataloader):  # {'real': real_video_tensor, 'generated':generated_video_tensor }
+            real_videos = batch_data["real"]
+            generated_videos = batch_data["generated"]
+            assert real_videos.shape[2] == generated_videos.shape[2]
+            if metric == "ssim":
+                tmp_list = list(calculate_ssim(real_videos, generated_videos)["value"].values())
+            elif metric == "psnr":
+                tmp_list = list(calculate_psnr(real_videos, generated_videos)["value"].values())
+            elif metric == "flolpips":
+                result = calculate_flolpips(real_videos, generated_videos, args.device)
+                tmp_list = list(result["value"].values())
+            elif metric == "lpips":
+                tmp_list = list(calculate_lpips(real_videos, generated_videos, args.device)["value"].values())
+            else:
+                print(f"metric {metric} is not in acceped list, not calculated")
+                continue
+            score_list += tmp_list
+        metric_dict[metric] = np.mean(score_list)
+
+    return metric_dict
+
+
+def main():
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "--type", type=str, choices=["video", "image"], default="video", help="whether evaluating images or videos"
+    )
+    parser.add_argument("--batch_size", type=int, default=2, help="Batch size to use")
+    parser.add_argument("--real_video_dir", type=str, help=("the path of real videos`"))
+    parser.add_argument("--generated_video_dir", type=str, help=("the path of generated videos`"))
+    parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=8,
+        help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
+    )
+    parser.add_argument("--sample_fps", type=int, default=30)
+    parser.add_argument("--resolution", type=int, default=336)
+    parser.add_argument("--crop_size", type=int, default=None)
+    parser.add_argument("--num_frames", type=int, default=100)
+    parser.add_argument("--sample_rate", type=int, default=1)
+    parser.add_argument("--subset_size", type=int, default=None)
+    # parser.add_argument("--metric", type=str, default="fvd",choices=['fvd','psnr','ssim','lpips', 'flolpips'])
+    parser.add_argument("--metric", nargs="+", default=[])
+    parser.add_argument("--fvd_method", type=str, default="styleganv", choices=["styleganv", "videogpt"])
+    parser.add_argument("--res_dir", type=str, default=None, help="dir to save result json")
+    args = parser.parse_args()
+
+    if args.device is None:
+        device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
+    else:
+        device = torch.device(args.device)
+
+    if args.num_workers is None:
+        try:
+            num_cpus = len(os.sched_getaffinity(0))
+        except AttributeError:
+            # os.sched_getaffinity is not available under Windows, use
+            # os.cpu_count instead (which may not return the *available* number
+            # of CPUs).
+            num_cpus = os.cpu_count()
+
+        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
+    else:
+        num_workers = args.num_workers
+
+    dataset = VideoDataset(
+        args.type,
+        args.real_video_dir,
+        args.generated_video_dir,
+        num_frames=args.num_frames,
+        sample_rate=args.sample_rate,
+        crop_size=args.crop_size,
+        resolution=args.resolution,
+    )
+
+    if args.subset_size:
+        indices = range(args.subset_size)
+        dataset = Subset(dataset, indices=indices)
+
+    dataloader = DataLoader(dataset, args.batch_size, num_workers=num_workers, pin_memory=True)
+
+    metric_score = calculate_common_metric(args, dataloader, device)
+    for k, v in metric_score.items():
+        metric_score[k] = round(v, 3)
+    print("metric: ", args.metric, " ", metric_score)
+
+    if args.res_dir:
+        output_file_path = os.path.join(
+            args.res_dir, "metric_" + str(args.num_frames) + "f_" + str(args.resolution) + "res.json"
+        )
+        with open(output_file_path, "w") as outfile:
+            json.dump(metric_score, outfile, indent=4, sort_keys=True)
+        print(f"metric results saved to: {output_file_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/vae/flolpips/correlation/correlation.py b/eval/vae/flolpips/correlation/correlation.py
new file mode 100644
index 0000000..6f6d6bd
--- /dev/null
+++ b/eval/vae/flolpips/correlation/correlation.py
@@ -0,0 +1,461 @@
+#!/usr/bin/env python
+
+import re
+
+import cupy
+import torch
+
+kernel_Correlation_rearrange = """
+	extern "C" __global__ void kernel_Correlation_rearrange(
+		const int n,
+		const float* input,
+		float* output
+	) {
+	  int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	  if (intIndex >= n) {
+	    return;
+	  }
+
+	  int intSample = blockIdx.z;
+	  int intChannel = blockIdx.y;
+
+	  float fltValue = input[(((intSample * SIZE_1(input)) + intChannel) * SIZE_2(input) * SIZE_3(input)) + intIndex];
+
+	  __syncthreads();
+
+	  int intPaddedY = (intIndex / SIZE_3(input)) + 4;
+	  int intPaddedX = (intIndex % SIZE_3(input)) + 4;
+	  int intRearrange = ((SIZE_3(input) + 8) * intPaddedY) + intPaddedX;
+
+	  output[(((intSample * SIZE_1(output) * SIZE_2(output)) + intRearrange) * SIZE_1(input)) + intChannel] = fltValue;
+	}
+"""
+
+kernel_Correlation_updateOutput = """
+	extern "C" __global__ void kernel_Correlation_updateOutput(
+	  const int n,
+	  const float* rbot0,
+	  const float* rbot1,
+	  float* top
+	) {
+	  extern __shared__ char patch_data_char[];
+
+	  float *patch_data = (float *)patch_data_char;
+
+	  // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
+	  int x1 = blockIdx.x + 4;
+	  int y1 = blockIdx.y + 4;
+	  int item = blockIdx.z;
+	  int ch_off = threadIdx.x;
+
+	  // Load 3D patch into shared shared memory
+	  for (int j = 0; j < 1; j++) { // HEIGHT
+	    for (int i = 0; i < 1; i++) { // WIDTH
+	      int ji_off = (j + i) * SIZE_3(rbot0);
+	      for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
+	        int idx1 = ((item * SIZE_1(rbot0) + y1+j) * SIZE_2(rbot0) + x1+i) * SIZE_3(rbot0) + ch;
+	        int idxPatchData = ji_off + ch;
+	        patch_data[idxPatchData] = rbot0[idx1];
+	      }
+	    }
+	  }
+
+	  __syncthreads();
+
+	  __shared__ float sum[32];
+
+	  // Compute correlation
+	  for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) {
+	    sum[ch_off] = 0;
+
+	    int s2o = top_channel % 9 - 4;
+	    int s2p = top_channel / 9 - 4;
+
+	    for (int j = 0; j < 1; j++) { // HEIGHT
+	      for (int i = 0; i < 1; i++) { // WIDTH
+	        int ji_off = (j + i) * SIZE_3(rbot0);
+	        for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
+	          int x2 = x1 + s2o;
+	          int y2 = y1 + s2p;
+
+	          int idxPatchData = ji_off + ch;
+	          int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch;
+
+	          sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2];
+	        }
+	      }
+	    }
+
+	    __syncthreads();
+
+	    if (ch_off == 0) {
+	      float total_sum = 0;
+	      for (int idx = 0; idx < 32; idx++) {
+	        total_sum += sum[idx];
+	      }
+	      const int sumelems = SIZE_3(rbot0);
+	      const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x;
+	      top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems;
+	    }
+	  }
+	}
+"""
+
+kernel_Correlation_updateGradFirst = """
+	#define ROUND_OFF 50000
+
+	extern "C" __global__ void kernel_Correlation_updateGradFirst(
+	  const int n,
+	  const int intSample,
+	  const float* rbot0,
+	  const float* rbot1,
+	  const float* gradOutput,
+	  float* gradFirst,
+	  float* gradSecond
+	) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
+	  int n = intIndex % SIZE_1(gradFirst); // channels
+	  int l = (intIndex / SIZE_1(gradFirst)) % SIZE_3(gradFirst) + 4; // w-pos
+	  int m = (intIndex / SIZE_1(gradFirst) / SIZE_3(gradFirst)) % SIZE_2(gradFirst) + 4; // h-pos
+
+	  // round_off is a trick to enable integer division with ceil, even for negative numbers
+	  // We use a large offset, for the inner part not to become negative.
+	  const int round_off = ROUND_OFF;
+	  const int round_off_s1 = round_off;
+
+	  // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
+	  int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)
+	  int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)
+
+	  // Same here:
+	  int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4)
+	  int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4)
+
+	  float sum = 0;
+	  if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
+	    xmin = max(0,xmin);
+	    xmax = min(SIZE_3(gradOutput)-1,xmax);
+
+	    ymin = max(0,ymin);
+	    ymax = min(SIZE_2(gradOutput)-1,ymax);
+
+	    for (int p = -4; p <= 4; p++) {
+	      for (int o = -4; o <= 4; o++) {
+	        // Get rbot1 data:
+	        int s2o = o;
+	        int s2p = p;
+	        int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n;
+	        float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n]
+
+	        // Index offset for gradOutput in following loops:
+	        int op = (p+4) * 9 + (o+4); // index[o,p]
+	        int idxopoffset = (intSample * SIZE_1(gradOutput) + op);
+
+	        for (int y = ymin; y <= ymax; y++) {
+	          for (int x = xmin; x <= xmax; x++) {
+	            int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
+	            sum += gradOutput[idxgradOutput] * bot1tmp;
+	          }
+	        }
+	      }
+	    }
+	  }
+	  const int sumelems = SIZE_1(gradFirst);
+	  const int bot0index = ((n * SIZE_2(gradFirst)) + (m-4)) * SIZE_3(gradFirst) + (l-4);
+	  gradFirst[bot0index + intSample*SIZE_1(gradFirst)*SIZE_2(gradFirst)*SIZE_3(gradFirst)] = sum / (float)sumelems;
+	} }
+"""
+
+kernel_Correlation_updateGradSecond = """
+	#define ROUND_OFF 50000
+
+	extern "C" __global__ void kernel_Correlation_updateGradSecond(
+	  const int n,
+	  const int intSample,
+	  const float* rbot0,
+	  const float* rbot1,
+	  const float* gradOutput,
+	  float* gradFirst,
+	  float* gradSecond
+	) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
+	  int n = intIndex % SIZE_1(gradSecond); // channels
+	  int l = (intIndex / SIZE_1(gradSecond)) % SIZE_3(gradSecond) + 4; // w-pos
+	  int m = (intIndex / SIZE_1(gradSecond) / SIZE_3(gradSecond)) % SIZE_2(gradSecond) + 4; // h-pos
+
+	  // round_off is a trick to enable integer division with ceil, even for negative numbers
+	  // We use a large offset, for the inner part not to become negative.
+	  const int round_off = ROUND_OFF;
+	  const int round_off_s1 = round_off;
+
+	  float sum = 0;
+	  for (int p = -4; p <= 4; p++) {
+	    for (int o = -4; o <= 4; o++) {
+	      int s2o = o;
+	      int s2p = p;
+
+	      //Get X,Y ranges and clamp
+	      // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
+	      int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)
+	      int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)
+
+	      // Same here:
+	      int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o)
+	      int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p)
+
+	      if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
+	        xmin = max(0,xmin);
+	        xmax = min(SIZE_3(gradOutput)-1,xmax);
+
+	        ymin = max(0,ymin);
+	        ymax = min(SIZE_2(gradOutput)-1,ymax);
+
+	        // Get rbot0 data:
+	        int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n;
+	        float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n]
+
+	        // Index offset for gradOutput in following loops:
+	        int op = (p+4) * 9 + (o+4); // index[o,p]
+	        int idxopoffset = (intSample * SIZE_1(gradOutput) + op);
+
+	        for (int y = ymin; y <= ymax; y++) {
+	          for (int x = xmin; x <= xmax; x++) {
+	            int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
+	            sum += gradOutput[idxgradOutput] * bot0tmp;
+	          }
+	        }
+	      }
+	    }
+	  }
+	  const int sumelems = SIZE_1(gradSecond);
+	  const int bot1index = ((n * SIZE_2(gradSecond)) + (m-4)) * SIZE_3(gradSecond) + (l-4);
+	  gradSecond[bot1index + intSample*SIZE_1(gradSecond)*SIZE_2(gradSecond)*SIZE_3(gradSecond)] = sum / (float)sumelems;
+	} }
+"""
+
+
+def cupy_kernel(strFunction, objVariables):
+    strKernel = globals()[strFunction]
+
+    while True:
+        objMatch = re.search("(SIZE_)([0-4])(\()([^\)]*)(\))", strKernel)
+
+        if objMatch is None:
+            break
+        # end
+
+        intArg = int(objMatch.group(2))
+
+        strTensor = objMatch.group(4)
+        intSizes = objVariables[strTensor].size()
+
+        strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg]))
+    # end
+
+    while True:
+        objMatch = re.search("(VALUE_)([0-4])(\()([^\)]+)(\))", strKernel)
+
+        if objMatch is None:
+            break
+        # end
+
+        intArgs = int(objMatch.group(2))
+        strArgs = objMatch.group(4).split(",")
+
+        strTensor = strArgs[0]
+        intStrides = objVariables[strTensor].stride()
+        strIndex = [
+            "(("
+            + strArgs[intArg + 1].replace("{", "(").replace("}", ")").strip()
+            + ")*"
+            + str(intStrides[intArg])
+            + ")"
+            for intArg in range(intArgs)
+        ]
+
+        strKernel = strKernel.replace(objMatch.group(0), strTensor + "[" + str.join("+", strIndex) + "]")
+    # end
+
+    return strKernel
+
+
+# end
+
+
+@cupy.memoize(for_each_device=True)
+def cupy_launch(strFunction, strKernel):
+    return cupy.RawKernel(strKernel, strFunction)  # RawKernel(code, name); result is memoized per device
+
+
+# end
+
+
+class _FunctionCorrelation(torch.autograd.Function):
+    """Correlation of two feature maps, computed by CuPy-launched CUDA kernels (CUDA tensors only)."""
+
+    @staticmethod
+    def forward(self, first, second):
+        """Return a [N, 81, H, W] tensor for [N, C, H, W] inputs; ``self`` is the autograd context."""
+        # Zero-initialized, channels-last buffers padded by 4 on each spatial side: [N, H + 8, W + 8, C].
+        # NOTE(review): rbot1 is sized from ``first``; presumably both inputs share one shape -- confirm.
+        rbot0 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])
+        rbot1 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])
+
+        # Saved while still empty; the rearrange kernels below fill these same buffers in place.
+        self.save_for_backward(first, second, rbot0, rbot1)
+
+        first = first.contiguous()
+        assert first.is_cuda == True
+        second = second.contiguous()
+        assert second.is_cuda == True
+
+        # 81 channels: one per displacement of the 9 x 9 window indexed in kernel_Correlation_updateOutput.
+        output = first.new_zeros([first.shape[0], 81, first.shape[2], first.shape[3]])
+
+        if first.is_cuda == True:
+            # Copy each input into its padded buffer.
+            n = first.shape[2] * first.shape[3]
+            cupy_launch(
+                "kernel_Correlation_rearrange",
+                cupy_kernel("kernel_Correlation_rearrange", {"input": first, "output": rbot0}),
+            )(
+                grid=tuple([int((n + 16 - 1) / 16), first.shape[1], first.shape[0]]),
+                block=tuple([16, 1, 1]),
+                args=[n, first.data_ptr(), rbot0.data_ptr()],
+            )
+
+            n = second.shape[2] * second.shape[3]
+            cupy_launch(
+                "kernel_Correlation_rearrange",
+                cupy_kernel("kernel_Correlation_rearrange", {"input": second, "output": rbot1}),
+            )(
+                grid=tuple([int((n + 16 - 1) / 16), second.shape[1], second.shape[0]]),
+                block=tuple([16, 1, 1]),
+                args=[n, second.data_ptr(), rbot1.data_ptr()],
+            )
+
+            # One block per output pixel and sample, 32 threads striding over the channels.
+            n = output.shape[1] * output.shape[2] * output.shape[3]
+            cupy_launch(
+                "kernel_Correlation_updateOutput",
+                cupy_kernel("kernel_Correlation_updateOutput", {"rbot0": rbot0, "rbot1": rbot1, "top": output}),
+            )(
+                grid=tuple([output.shape[3], output.shape[2], output.shape[0]]),
+                block=tuple([32, 1, 1]),
+                shared_mem=first.shape[1] * 4,  # one 4-byte float per channel
+                args=[n, rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr()],
+            )
+
+        elif first.is_cuda == False:
+            raise NotImplementedError()
+
+        return output
+
+    @staticmethod
+    def backward(self, gradOutput):
+        """Return gradients w.r.t. ``first`` and ``second`` (None for an input that needs none)."""
+        first, second, rbot0, rbot1 = self.saved_tensors
+
+        gradOutput = gradOutput.contiguous()
+        assert gradOutput.is_cuda == True
+
+        gradFirst = (
+            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
+            if self.needs_input_grad[0] == True
+            else None
+        )
+        gradSecond = (
+            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
+            if self.needs_input_grad[1] == True
+            else None
+        )
+
+        if first.is_cuda == True:
+            if gradFirst is not None:
+                # One launch per sample; the kernel receives the sample index as ``intSample``.
+                for intSample in range(first.shape[0]):
+                    n = first.shape[1] * first.shape[2] * first.shape[3]
+                    cupy_launch(
+                        "kernel_Correlation_updateGradFirst",
+                        cupy_kernel(
+                            "kernel_Correlation_updateGradFirst",
+                            {
+                                "rbot0": rbot0,
+                                "rbot1": rbot1,
+                                "gradOutput": gradOutput,
+                                "gradFirst": gradFirst,
+                                "gradSecond": None,
+                            },
+                        ),
+                    )(
+                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
+                        block=tuple([512, 1, 1]),
+                        args=[
+                            n,
+                            intSample,
+                            rbot0.data_ptr(),
+                            rbot1.data_ptr(),
+                            gradOutput.data_ptr(),
+                            gradFirst.data_ptr(),
+                            None,
+                        ],
+                    )
+
+            if gradSecond is not None:
+                # Same per-sample launch scheme for the second input.
+                for intSample in range(first.shape[0]):
+                    n = first.shape[1] * first.shape[2] * first.shape[3]
+                    cupy_launch(
+                        "kernel_Correlation_updateGradSecond",
+                        cupy_kernel(
+                            "kernel_Correlation_updateGradSecond",
+                            {
+                                "rbot0": rbot0,
+                                "rbot1": rbot1,
+                                "gradOutput": gradOutput,
+                                "gradFirst": None,
+                                "gradSecond": gradSecond,
+                            },
+                        ),
+                    )(
+                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
+                        block=tuple([512, 1, 1]),
+                        args=[
+                            n,
+                            intSample,
+                            rbot0.data_ptr(),
+                            rbot1.data_ptr(),
+                            gradOutput.data_ptr(),
+                            None,
+                            gradSecond.data_ptr(),
+                        ],
+                    )
+
+        elif first.is_cuda == False:
+            raise NotImplementedError()
+
+        return gradFirst, gradSecond
+
+
+# end
+
+
+def FunctionCorrelation(tenFirst, tenSecond):
+    return _FunctionCorrelation.apply(tenFirst, tenSecond)  # functional entry point to the autograd op
+
+
+# end
+
+
+class ModuleCorrelation(torch.nn.Module):
+    def __init__(self):
+        super(ModuleCorrelation, self).__init__()
+
+    # end
+
+    def forward(self, tenFirst, tenSecond):
+        return _FunctionCorrelation.apply(tenFirst, tenSecond)
+
+    # end
+
+
+# end
diff --git a/eval/vae/flolpips/flolpips.py b/eval/vae/flolpips/flolpips.py
new file mode 100644
index 0000000..ca75f1d
--- /dev/null
+++ b/eval/vae/flolpips/flolpips.py
@@ -0,0 +1,412 @@
+from __future__ import absolute_import
+
+import hashlib
+import os
+
+import requests
+import torch
+import torch.nn
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+from tqdm import tqdm
+
+from .pretrained_networks import alexnet, squeezenet, vgg16
+from .pwcnet import Network as PWCNet
+from .utils import *
+
+# Download URL of each checkpoint, keyed by the name passed to get_ckpt_path().
+URL_MAP = {"alex": "https://raw.githubusercontent.com/danier97/flolpips/main/weights/v0.1/alex.pth"}
+
+# File name each checkpoint is stored under inside the local weights directory.
+CKPT_MAP = {"alex": "alex.pth"}
+
+# Expected MD5 hex digest of each checkpoint file; get_ckpt_path() compares a
+# downloaded file against it.
+MD5_MAP = {"alex": "9642209e2b57a85d20f86d812320f9e6"}
+
+
+def spatial_average(in_tens, keepdim=True):
+    return in_tens.mean([2, 3], keepdim=keepdim)
+
+
+def mw_spatial_average(in_tens, flow, keepdim=True):
+    _, _, h, w = in_tens.shape
+    flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
+    flow_mag = torch.sqrt(flow[:, 0:1] ** 2 + flow[:, 1:2] ** 2)
+    flow_mag = flow_mag / torch.sum(flow_mag, dim=[1, 2, 3], keepdim=True)
+    return torch.sum(in_tens * flow_mag, dim=[2, 3], keepdim=keepdim)
+
+
+def mtw_spatial_average(in_tens, flow, texture, keepdim=True):
+    _, _, h, w = in_tens.shape
+    flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
+    texture = F.interpolate(texture, (h, w), align_corners=False, mode="bilinear")
+    flow_mag = torch.sqrt(flow[:, 0:1] ** 2 + flow[:, 1:2] ** 2)
+    flow_mag = (flow_mag - flow_mag.min()) / (flow_mag.max() - flow_mag.min()) + 1e-6
+    texture = (texture - texture.min()) / (texture.max() - texture.min()) + 1e-6
+    weight = flow_mag / texture
+    weight /= torch.sum(weight)
+    return torch.sum(in_tens * weight, dim=[2, 3], keepdim=keepdim)
+
+
+def m2w_spatial_average(in_tens, flow, keepdim=True):
+    _, _, h, w = in_tens.shape
+    flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
+    flow_mag = flow[:, 0:1] ** 2 + flow[:, 1:2] ** 2  # B,1,H,W
+    flow_mag = flow_mag / torch.sum(flow_mag)
+    return torch.sum(in_tens * flow_mag, dim=[2, 3], keepdim=keepdim)
+
+
+def upsample(in_tens, out_HW=(64, 64)):  # assumes scale factor is same for H and W
+    in_H, in_W = in_tens.shape[2], in_tens.shape[3]
+    return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens)
+
+
+def md5_hash(path):
+    with open(path, "rb") as f:
+        content = f.read()
+    return hashlib.md5(content).hexdigest()
+
+
+def download(url, local_path, chunk_size=1024):
+    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
+    with requests.get(url, stream=True) as r:
+        total_size = int(r.headers.get("content-length", 0))
+        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+            with open(local_path, "wb") as f:
+                for data in r.iter_content(chunk_size=chunk_size):
+                    if data:
+                        f.write(data)
+                        pbar.update(chunk_size)
+
+
+def get_ckpt_path(name, root, check=False):
+    assert name in URL_MAP
+    path = os.path.join(root, CKPT_MAP[name])
+    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
+        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+        download(URL_MAP[name], path)
+        md5 = md5_hash(path)
+        assert md5 == MD5_MAP[name], md5
+    return path
+
+
+# Learned perceptual metric
+class LPIPS(nn.Module):
+    def __init__(
+        self,
+        pretrained=True,
+        net="alex",
+        version="0.1",
+        lpips=True,
+        spatial=False,
+        pnet_rand=False,
+        pnet_tune=False,
+        use_dropout=True,
+        model_path=None,
+        eval_mode=True,
+        verbose=False,
+    ):
+        # lpips - [True] means with linear calibration on top of base network
+        # pretrained - [True] means load linear weights
+
+        super(LPIPS, self).__init__()
+        if verbose:
+            print(
+                "Setting up [%s] perceptual loss: trunk [%s], v[%s], spatial [%s]"
+                % ("LPIPS" if lpips else "baseline", net, version, "on" if spatial else "off")
+            )
+
+        self.pnet_type = net
+        self.pnet_tune = pnet_tune
+        self.pnet_rand = pnet_rand
+        self.spatial = spatial
+        self.lpips = lpips  # false means baseline of just averaging all layers
+        self.version = version
+        self.scaling_layer = ScalingLayer()
+
+        if self.pnet_type in ["vgg", "vgg16"]:
+            net_type = vgg16
+            self.chns = [64, 128, 256, 512, 512]
+        elif self.pnet_type == "alex":
+            net_type = alexnet
+            self.chns = [64, 192, 384, 256, 256]
+        elif self.pnet_type == "squeeze":
+            net_type = squeezenet
+            self.chns = [64, 128, 256, 384, 384, 512, 512]
+        self.L = len(self.chns)
+
+        self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)
+
+        if lpips:
+            self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+            self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+            self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+            self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+            self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+            self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+            if self.pnet_type == "squeeze":  # 7 layers for squeezenet
+                self.lin5 = NetLinLayer(self.chns[5], use_dropout=use_dropout)
+                self.lin6 = NetLinLayer(self.chns[6], use_dropout=use_dropout)
+                self.lins += [self.lin5, self.lin6]
+            self.lins = nn.ModuleList(self.lins)
+
+            if pretrained:
+                self.load_from_pretrained(version, net)
+                if verbose:
+                    print("Loaded model from: %s" % model_path)
+
+        if eval_mode:
+            self.eval()
+
+    def load_from_pretrained(self, version, net):
+        ckpt = get_ckpt_path(net, "pretrained_models/flolpips/weights/v%s" % (version))
+        self.load_state_dict(torch.load(ckpt, map_location="cpu"), strict=False)
+
+    def forward(self, in0, in1, retPerLayer=False, normalize=False):
+        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
+            in0 = 2 * in0 - 1
+            in1 = 2 * in1 - 1
+
+        # v0.0 - original release had a bug, where input was not scaled
+        in0_input, in1_input = (
+            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
+        )
+        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
+        feats0, feats1, diffs = {}, {}, {}
+
+        for kk in range(self.L):
+            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+
+        if self.lpips:
+            if self.spatial:
+                res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)]
+            else:
+                res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
+        else:
+            if self.spatial:
+                res = [upsample(diffs[kk].sum(dim=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)]
+            else:
+                res = [spatial_average(diffs[kk].sum(dim=1, keepdim=True), keepdim=True) for kk in range(self.L)]
+
+        # val = res[0]
+        # for l in range(1,self.L):
+        #     val += res[l]
+        #     print(val)
+
+        # a = spatial_average(self.lins[kk](diffs[kk]), keepdim=True)
+        # b = torch.max(self.lins[kk](feats0[kk]**2))
+        # for kk in range(self.L):
+        #     a += spatial_average(self.lins[kk](diffs[kk]), keepdim=True)
+        #     b = torch.max(b,torch.max(self.lins[kk](feats0[kk]**2)))
+        # a = a/self.L
+        # from IPython import embed
+        # embed()
+        # return 10*torch.log10(b/a)
+
+        # if(retPerLayer):
+        #     return (val, res)
+        # else:
+        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdims=False)
+
+
+class ScalingLayer(nn.Module):
+    def __init__(self):
+        super(ScalingLayer, self).__init__()
+        self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None])
+        self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None])
+
+    def forward(self, inp):
+        return (inp - self.shift) / self.scale
+
+
+class NetLinLayer(nn.Module):
+    """A single linear layer which does a 1x1 conv"""
+
+    def __init__(self, chn_in, chn_out=1, use_dropout=False):
+        super(NetLinLayer, self).__init__()
+
+        layers = (
+            [
+                nn.Dropout(),
+            ]
+            if (use_dropout)
+            else []
+        )
+        layers += [
+            nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False),
+        ]
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class Dist2LogitLayer(nn.Module):
+    """takes 2 distances, puts through fc layers, spits out value between [0,1] (if use_sigmoid is True)"""
+
+    def __init__(self, chn_mid=32, use_sigmoid=True):
+        super(Dist2LogitLayer, self).__init__()
+
+        layers = [
+            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
+        ]
+        layers += [
+            nn.LeakyReLU(0.2, True),
+        ]
+        layers += [
+            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
+        ]
+        layers += [
+            nn.LeakyReLU(0.2, True),
+        ]
+        layers += [
+            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
+        ]
+        if use_sigmoid:
+            layers += [
+                nn.Sigmoid(),
+            ]
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, d0, d1, eps=0.1):
+        return self.model.forward(torch.cat((d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1))
+
+
+class BCERankingLoss(nn.Module):
+    def __init__(self, chn_mid=32):
+        super(BCERankingLoss, self).__init__()
+        self.net = Dist2LogitLayer(chn_mid=chn_mid)
+        # self.parameters = list(self.net.parameters())
+        self.loss = torch.nn.BCELoss()
+
+    def forward(self, d0, d1, judge):
+        per = (judge + 1.0) / 2.0
+        self.logit = self.net.forward(d0, d1)
+        return self.loss(self.logit, per)
+
+
+# L2, DSSIM metrics
+class FakeNet(nn.Module):
+    def __init__(self, use_gpu=True, colorspace="Lab"):
+        super(FakeNet, self).__init__()
+        self.use_gpu = use_gpu
+        self.colorspace = colorspace
+
+
+class L2(FakeNet):
+    def forward(self, in0, in1, retPerLayer=None):
+        assert in0.size()[0] == 1  # currently only supports batchSize 1
+
+        if self.colorspace == "RGB":
+            (N, C, X, Y) = in0.size()
+            value = torch.mean(
+                torch.mean(torch.mean((in0 - in1) ** 2, dim=1).view(N, 1, X, Y), dim=2).view(N, 1, 1, Y), dim=3
+            ).view(N)
+            return value
+        elif self.colorspace == "Lab":
+            value = l2(
+                tensor2np(tensor2tensorlab(in0.data, to_norm=False)),
+                tensor2np(tensor2tensorlab(in1.data, to_norm=False)),
+                range=100.0,
+            ).astype("float")
+            ret_var = Variable(torch.Tensor((value,)))
+            if self.use_gpu:
+                ret_var = ret_var.cuda()
+            return ret_var
+
+
+class DSSIM(FakeNet):
+    def forward(self, in0, in1, retPerLayer=None):
+        assert in0.size()[0] == 1  # currently only supports batchSize 1
+
+        if self.colorspace == "RGB":
+            value = dssim(1.0 * tensor2im(in0.data), 1.0 * tensor2im(in1.data), range=255.0).astype("float")
+        elif self.colorspace == "Lab":
+            value = dssim(
+                tensor2np(tensor2tensorlab(in0.data, to_norm=False)),
+                tensor2np(tensor2tensorlab(in1.data, to_norm=False)),
+                range=100.0,
+            ).astype("float")
+        ret_var = Variable(torch.Tensor((value,)))
+        if self.use_gpu:
+            ret_var = ret_var.cuda()
+        return ret_var
+
+
+def print_network(net):
+    num_params = 0
+    for param in net.parameters():
+        num_params += param.numel()
+    print("Network", net)
+    print("Total number of parameters: %d" % num_params)
+
+
+class FloLPIPS(LPIPS):
+    def __init__(
+        self,
+        pretrained=True,
+        net="alex",
+        version="0.1",
+        lpips=True,
+        spatial=False,
+        pnet_rand=False,
+        pnet_tune=False,
+        use_dropout=True,
+        model_path=None,
+        eval_mode=True,
+        verbose=False,
+    ):
+        super(FloLPIPS, self).__init__(
+            pretrained, net, version, lpips, spatial, pnet_rand, pnet_tune, use_dropout, model_path, eval_mode, verbose
+        )
+
+    def forward(self, in0, in1, flow, retPerLayer=False, normalize=False):
+        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
+            in0 = 2 * in0 - 1
+            in1 = 2 * in1 - 1
+
+        in0_input, in1_input = (
+            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
+        )
+        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
+        feats0, feats1, diffs = {}, {}, {}
+
+        for kk in range(self.L):
+            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+
+        res = [mw_spatial_average(self.lins[kk](diffs[kk]), flow, keepdim=True) for kk in range(self.L)]
+
+        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdims=False)
+
+
+class Flolpips(nn.Module):
+    def __init__(self):
+        super(Flolpips, self).__init__()
+        self.loss_fn = FloLPIPS(net="alex", version="0.1")
+        self.flownet = PWCNet()
+
+    @torch.no_grad()
+    def forward(self, I0, I1, frame_dis, frame_ref):
+        """
+        args:
+            I0: first frame of the triplet, shape: [B, C, H, W]
+            I1: third frame of the triplet, shape: [B, C, H, W]
+            frame_dis: prediction of the intermediate frame, shape: [B, C, H, W]
+            frame_ref: ground-truth of the intermediate frame, shape: [B, C, H, W]
+        """
+        assert (
+            I0.size() == I1.size() == frame_dis.size() == frame_ref.size()
+        ), "the 4 input tensors should have same size"
+
+        flow_ref = self.flownet(frame_ref, I0)
+        flow_dis = self.flownet(frame_dis, I0)
+        flow_diff = flow_ref - flow_dis
+        flolpips_wrt_I0 = self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True)
+
+        flow_ref = self.flownet(frame_ref, I1)
+        flow_dis = self.flownet(frame_dis, I1)
+        flow_diff = flow_ref - flow_dis
+        flolpips_wrt_I1 = self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True)
+
+        flolpips = (flolpips_wrt_I0 + flolpips_wrt_I1) / 2
+        return flolpips
diff --git a/eval/vae/flolpips/pretrained_networks.py b/eval/vae/flolpips/pretrained_networks.py
new file mode 100644
index 0000000..5915b05
--- /dev/null
+++ b/eval/vae/flolpips/pretrained_networks.py
@@ -0,0 +1,182 @@
+from collections import namedtuple
+
+import torch
+from torchvision import models as tv
+
+
+class squeezenet(torch.nn.Module):
+    def __init__(self, requires_grad=False, pretrained=True):
+        super(squeezenet, self).__init__()
+        pretrained_features = tv.squeezenet1_1(pretrained=pretrained).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        self.slice5 = torch.nn.Sequential()
+        self.slice6 = torch.nn.Sequential()
+        self.slice7 = torch.nn.Sequential()
+        self.N_slices = 7
+        for x in range(2):
+            self.slice1.add_module(str(x), pretrained_features[x])
+        for x in range(2, 5):
+            self.slice2.add_module(str(x), pretrained_features[x])
+        for x in range(5, 8):
+            self.slice3.add_module(str(x), pretrained_features[x])
+        for x in range(8, 10):
+            self.slice4.add_module(str(x), pretrained_features[x])
+        for x in range(10, 11):
+            self.slice5.add_module(str(x), pretrained_features[x])
+        for x in range(11, 12):
+            self.slice6.add_module(str(x), pretrained_features[x])
+        for x in range(12, 13):
+            self.slice7.add_module(str(x), pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1 = h
+        h = self.slice2(h)
+        h_relu2 = h
+        h = self.slice3(h)
+        h_relu3 = h
+        h = self.slice4(h)
+        h_relu4 = h
+        h = self.slice5(h)
+        h_relu5 = h
+        h = self.slice6(h)
+        h_relu6 = h
+        h = self.slice7(h)
+        h_relu7 = h
+        vgg_outputs = namedtuple("SqueezeOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"])
+        out = vgg_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5, h_relu6, h_relu7)
+
+        return out
+
+
+class alexnet(torch.nn.Module):
+    def __init__(self, requires_grad=False, pretrained=True):
+        super(alexnet, self).__init__()
+        alexnet_pretrained_features = tv.alexnet(pretrained=pretrained).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        self.slice5 = torch.nn.Sequential()
+        self.N_slices = 5
+        for x in range(2):
+            self.slice1.add_module(str(x), alexnet_pretrained_features[x])
+        for x in range(2, 5):
+            self.slice2.add_module(str(x), alexnet_pretrained_features[x])
+        for x in range(5, 8):
+            self.slice3.add_module(str(x), alexnet_pretrained_features[x])
+        for x in range(8, 10):
+            self.slice4.add_module(str(x), alexnet_pretrained_features[x])
+        for x in range(10, 12):
+            self.slice5.add_module(str(x), alexnet_pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1 = h
+        h = self.slice2(h)
+        h_relu2 = h
+        h = self.slice3(h)
+        h_relu3 = h
+        h = self.slice4(h)
+        h_relu4 = h
+        h = self.slice5(h)
+        h_relu5 = h
+        alexnet_outputs = namedtuple("AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"])
+        out = alexnet_outputs(h_relu1, h_relu2, h_relu3, h_relu4, h_relu5)
+
+        return out
+
+
+class vgg16(torch.nn.Module):
+    def __init__(self, requires_grad=False, pretrained=True):
+        super(vgg16, self).__init__()
+        vgg_pretrained_features = tv.vgg16(pretrained=pretrained).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        self.slice5 = torch.nn.Sequential()
+        self.N_slices = 5
+        for x in range(4):
+            self.slice1.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(4, 9):
+            self.slice2.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(9, 16):
+            self.slice3.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(16, 23):
+            self.slice4.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(23, 30):
+            self.slice5.add_module(str(x), vgg_pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1_2 = h
+        h = self.slice2(h)
+        h_relu2_2 = h
+        h = self.slice3(h)
+        h_relu3_3 = h
+        h = self.slice4(h)
+        h_relu4_3 = h
+        h = self.slice5(h)
+        h_relu5_3 = h
+        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
+        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
+
+        return out
+
+
+class resnet(torch.nn.Module):
+    def __init__(self, requires_grad=False, pretrained=True, num=18):
+        super(resnet, self).__init__()
+        if num == 18:
+            self.net = tv.resnet18(pretrained=pretrained)
+        elif num == 34:
+            self.net = tv.resnet34(pretrained=pretrained)
+        elif num == 50:
+            self.net = tv.resnet50(pretrained=pretrained)
+        elif num == 101:
+            self.net = tv.resnet101(pretrained=pretrained)
+        elif num == 152:
+            self.net = tv.resnet152(pretrained=pretrained)
+        self.N_slices = 5
+
+        self.conv1 = self.net.conv1
+        self.bn1 = self.net.bn1
+        self.relu = self.net.relu
+        self.maxpool = self.net.maxpool
+        self.layer1 = self.net.layer1
+        self.layer2 = self.net.layer2
+        self.layer3 = self.net.layer3
+        self.layer4 = self.net.layer4
+
+    def forward(self, X):
+        h = self.conv1(X)
+        h = self.bn1(h)
+        h = self.relu(h)
+        h_relu1 = h
+        h = self.maxpool(h)
+        h = self.layer1(h)
+        h_conv2 = h
+        h = self.layer2(h)
+        h_conv3 = h
+        h = self.layer3(h)
+        h_conv4 = h
+        h = self.layer4(h)
+        h_conv5 = h
+
+        outputs = namedtuple("Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"])
+        out = outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)
+
+        return out
diff --git a/eval/vae/flolpips/pwcnet.py b/eval/vae/flolpips/pwcnet.py
new file mode 100644
index 0000000..3abc0ed
--- /dev/null
+++ b/eval/vae/flolpips/pwcnet.py
@@ -0,0 +1,472 @@
+#!/usr/bin/env python
+
+import math
+
+import torch
+
+# try:
+from .correlation import correlation  # the custom cost volume layer
+
+# except:
+# 	sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python
+# end
+
+##########################################################
+
+# assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0
+
+# torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance
+
+# torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance
+
+# ##########################################################
+
+# arguments_strModel = 'default' # 'default', or 'chairs-things'
+# arguments_strFirst = './images/first.png'
+# arguments_strSecond = './images/second.png'
+# arguments_strOut = './out.flo'
+
+# for strOption, strArgument in getopt.getopt(sys.argv[1:], '', [ strParameter[2:] + '=' for strParameter in sys.argv[1::2] ])[0]:
+# 	if strOption == '--model' and strArgument != '': arguments_strModel = strArgument # which model to use
+# 	if strOption == '--first' and strArgument != '': arguments_strFirst = strArgument # path to the first frame
+# 	if strOption == '--second' and strArgument != '': arguments_strSecond = strArgument # path to the second frame
+# 	if strOption == '--out' and strArgument != '': arguments_strOut = strArgument # path to where the output should be stored
+# end
+
+##########################################################
+
+
+def backwarp(tenInput, tenFlow):
+    backwarp_tenGrid = {}
+    backwarp_tenPartial = {}
+    if str(tenFlow.shape) not in backwarp_tenGrid:
+        tenHor = (
+            torch.linspace(-1.0 + (1.0 / tenFlow.shape[3]), 1.0 - (1.0 / tenFlow.shape[3]), tenFlow.shape[3])
+            .view(1, 1, 1, -1)
+            .expand(-1, -1, tenFlow.shape[2], -1)
+        )
+        tenVer = (
+            torch.linspace(-1.0 + (1.0 / tenFlow.shape[2]), 1.0 - (1.0 / tenFlow.shape[2]), tenFlow.shape[2])
+            .view(1, 1, -1, 1)
+            .expand(-1, -1, -1, tenFlow.shape[3])
+        )
+
+        backwarp_tenGrid[str(tenFlow.shape)] = torch.cat([tenHor, tenVer], 1).cuda()
+    # end
+
+    if str(tenFlow.shape) not in backwarp_tenPartial:
+        backwarp_tenPartial[str(tenFlow.shape)] = tenFlow.new_ones(
+            [tenFlow.shape[0], 1, tenFlow.shape[2], tenFlow.shape[3]]
+        )
+    # end
+
+    tenFlow = torch.cat(
+        [
+            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
+            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
+        ],
+        1,
+    )
+    tenInput = torch.cat([tenInput, backwarp_tenPartial[str(tenFlow.shape)]], 1)
+
+    tenOutput = torch.nn.functional.grid_sample(
+        input=tenInput,
+        grid=(backwarp_tenGrid[str(tenFlow.shape)] + tenFlow).permute(0, 2, 3, 1),
+        mode="bilinear",
+        padding_mode="zeros",
+        align_corners=False,
+    )
+
+    tenMask = tenOutput[:, -1:, :, :]
+    tenMask[tenMask > 0.999] = 1.0
+    tenMask[tenMask < 1.0] = 0.0
+
+    return tenOutput[:, :-1, :, :] * tenMask
+
+
+# end
+
+##########################################################
+
+
+class Network(torch.nn.Module):
+    def __init__(self):
+        super(Network, self).__init__()
+
+        class Extractor(torch.nn.Module):
+            def __init__(self):
+                super(Extractor, self).__init__()
+
+                self.netOne = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netTwo = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netThr = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netFou = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netFiv = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netSix = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=128, out_channels=196, kernel_size=3, stride=2, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+            # end
+
+            def forward(self, tenInput):
+                tenOne = self.netOne(tenInput)
+                tenTwo = self.netTwo(tenOne)
+                tenThr = self.netThr(tenTwo)
+                tenFou = self.netFou(tenThr)
+                tenFiv = self.netFiv(tenFou)
+                tenSix = self.netSix(tenFiv)
+
+                return [tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix]
+
+            # end
+
+        # end
+
+        class Decoder(torch.nn.Module):
+            def __init__(self, intLevel):
+                """Build the layers of the decoder for index ``intLevel``.
+
+                Args:
+                    intLevel: Index into the channel tables below. The tables
+                        hold ``None`` at positions 0, 1 and 7, so the first
+                        convolution can only be sized for 2 <= intLevel <= 6.
+                """
+                super(Decoder, self).__init__()
+
+                # Table entry one position above intLevel; it sizes the input
+                # of ``netUpfeat`` and is only used when intLevel < 6.
+                intPrevious = [
+                    None,
+                    None,
+                    81 + 32 + 2 + 2,
+                    81 + 64 + 2 + 2,
+                    81 + 96 + 2 + 2,
+                    81 + 128 + 2 + 2,
+                    81,
+                    None,
+                ][intLevel + 1]
+                # Table entry at intLevel; input channels of ``netOne``.
+                intCurrent = [
+                    None,
+                    None,
+                    81 + 32 + 2 + 2,
+                    81 + 64 + 2 + 2,
+                    81 + 96 + 2 + 2,
+                    81 + 128 + 2 + 2,
+                    81,
+                    None,
+                ][intLevel + 0]
+
+                # The two transposed convolutions (stride 2, 2 output channels)
+                # and the scalar ``fltBackwarp`` exist only for intLevel < 6.
+                if intLevel < 6:
+                    self.netUpflow = torch.nn.ConvTranspose2d(
+                        in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1
+                    )
+                if intLevel < 6:
+                    self.netUpfeat = torch.nn.ConvTranspose2d(
+                        in_channels=intPrevious + 128 + 128 + 96 + 64 + 32,
+                        out_channels=2,
+                        kernel_size=4,
+                        stride=2,
+                        padding=1,
+                    )
+                if intLevel < 6:
+                    self.fltBackwarp = [None, None, None, 5.0, 2.5, 1.25, 0.625, None][intLevel + 1]
+
+                # Convolution stack. Each stage's in_channels is intCurrent plus
+                # the out_channels of every earlier stage (128, 128, 96, 64, 32).
+                # NOTE(review): presumably forward() concatenates each stage's
+                # output onto its input; forward is not fully visible here, confirm.
+                self.netOne = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=intCurrent, out_channels=128, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netTwo = torch.nn.Sequential(
+                    torch.nn.Conv2d(in_channels=intCurrent + 128, out_channels=128, kernel_size=3, stride=1, padding=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netThr = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=intCurrent + 128 + 128, out_channels=96, kernel_size=3, stride=1, padding=1
+                    ),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netFou = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=intCurrent + 128 + 128 + 96, out_channels=64, kernel_size=3, stride=1, padding=1
+                    ),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                self.netFiv = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=intCurrent + 128 + 128 + 96 + 64,
+                        out_channels=32,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                    ),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                )
+
+                # Final stage: 2 output channels and no activation after it.
+                self.netSix = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=intCurrent + 128 + 128 + 96 + 64 + 32,
+                        out_channels=2,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                    )
+                )
+
+            # end
+
+            def forward(self, tenFirst, tenSecond, objPrevious):
+                tenFlow = None
+                tenFeat = None
+
+                if objPrevious is None:
+                    tenFlow = None
+                    tenFeat = None
+
+                    tenVolume = torch.nn.functional.leaky_relu(
+                        input=correlation.FunctionCorrelation(tenFirst=tenFirst, tenSecond=tenSecond),
+                        negative_slope=0.1,
+                        inplace=False,
+                    )
+
+                    tenFeat = torch.cat([tenVolume], 1)
+
+                elif objPrevious is not None:
+                    tenFlow = self.netUpflow(objPrevious["tenFlow"])
+                    tenFeat = self.netUpfeat(objPrevious["tenFeat"])
+
+                    tenVolume = torch.nn.functional.leaky_relu(
+                        input=correlation.FunctionCorrelation(
+                            tenFirst=tenFirst,
+                            tenSecond=backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackwarp),
+                        ),
+                        negative_slope=0.1,
+                        inplace=False,
+                    )
+
+                    tenFeat = torch.cat([tenVolume, tenFirst, tenFlow, tenFeat], 1)
+
+                # end
+
+                tenFeat = torch.cat([self.netOne(tenFeat), tenFeat], 1)
+                tenFeat = torch.cat([self.netTwo(tenFeat), tenFeat], 1)
+                tenFeat = torch.cat([self.netThr(tenFeat), tenFeat], 1)
+                tenFeat = torch.cat([self.netFou(tenFeat), tenFeat], 1)
+                tenFeat = torch.cat([self.netFiv(tenFeat), tenFeat], 1)
+
+                tenFlow = self.netSix(tenFeat)
+
+                return {"tenFlow": tenFlow, "tenFeat": tenFeat}
+
+            # end
+
+        # end
+
+        class Refiner(torch.nn.Module):
+            def __init__(self):
+                super(Refiner, self).__init__()
+
+                self.netMain = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=81 + 32 + 2 + 2 + 128 + 128 + 96 + 64 + 32,
+                        out_channels=128,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                        dilation=1,
+                    ),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2, dilation=2),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=4, dilation=4),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=128, out_channels=96, kernel_size=3, stride=1, padding=8, dilation=8),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=96, out_channels=64, kernel_size=3, stride=1, padding=16, dilation=16),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, dilation=1),
+                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
+                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=1, padding=1, dilation=1),
+                )
+
+            # end
+
+            def forward(self, tenInput):
+                return self.netMain(tenInput)
+
+            # end
+
+        # end
+
+        self.netExtractor = Extractor()
+
+        self.netTwo = Decoder(2)
+        self.netThr = Decoder(3)
+        self.netFou = Decoder(4)
+        self.netFiv = Decoder(5)
+        self.netSix = Decoder(6)
+
+        self.netRefiner = Refiner()
+
+        self.load_state_dict(
+            {
+                strKey.replace("module", "net"): tenWeight
+                for strKey, tenWeight in torch.hub.load_state_dict_from_url(
+                    url="http://content.sniklaus.com/github/pytorch-pwc/network-" + "default" + ".pytorch"
+                ).items()
+            }
+        )
+
+    # end
+
+    def forward(self, tenFirst, tenSecond):
+        intWidth = tenFirst.shape[3]
+        intHeight = tenFirst.shape[2]
+
+        intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0))
+        intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0))
+
+        tenPreprocessedFirst = torch.nn.functional.interpolate(
+            input=tenFirst, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
+        )
+        tenPreprocessedSecond = torch.nn.functional.interpolate(
+            input=tenSecond, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
+        )
+
+        tenFirst = self.netExtractor(tenPreprocessedFirst)
+        tenSecond = self.netExtractor(tenPreprocessedSecond)
+
+        objEstimate = self.netSix(tenFirst[-1], tenSecond[-1], None)
+        objEstimate = self.netFiv(tenFirst[-2], tenSecond[-2], objEstimate)
+        objEstimate = self.netFou(tenFirst[-3], tenSecond[-3], objEstimate)
+        objEstimate = self.netThr(tenFirst[-4], tenSecond[-4], objEstimate)
+        objEstimate = self.netTwo(tenFirst[-5], tenSecond[-5], objEstimate)
+
+        tenFlow = objEstimate["tenFlow"] + self.netRefiner(objEstimate["tenFeat"])
+        tenFlow = 20.0 * torch.nn.functional.interpolate(
+            input=tenFlow, size=(intHeight, intWidth), mode="bilinear", align_corners=False
+        )
+        tenFlow[:, 0, :, :] *= float(intWidth) / float(intPreprocessedWidth)
+        tenFlow[:, 1, :, :] *= float(intHeight) / float(intPreprocessedHeight)
+
+        return tenFlow
+
+    # end
+
+
+# end
+
+netNetwork = None
+
+##########################################################
+
+
+def estimate(tenFirst, tenSecond):
+    global netNetwork
+
+    if netNetwork is None:
+        netNetwork = Network().cuda().eval()
+    # end
+
+    assert tenFirst.shape[1] == tenSecond.shape[1]
+    assert tenFirst.shape[2] == tenSecond.shape[2]
+
+    intWidth = tenFirst.shape[2]
+    intHeight = tenFirst.shape[1]
+
+    assert (
+        intWidth == 1024
+    )  # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue
+    assert (
+        intHeight == 436
+    )  # remember that there is no guarantee for correctness, comment this line out if you acknowledge this and want to continue
+
+    tenPreprocessedFirst = tenFirst.cuda().view(1, 3, intHeight, intWidth)
+    tenPreprocessedSecond = tenSecond.cuda().view(1, 3, intHeight, intWidth)
+
+    intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0))
+    intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0))
+
+    tenPreprocessedFirst = torch.nn.functional.interpolate(
+        input=tenPreprocessedFirst,
+        size=(intPreprocessedHeight, intPreprocessedWidth),
+        mode="bilinear",
+        align_corners=False,
+    )
+    tenPreprocessedSecond = torch.nn.functional.interpolate(
+        input=tenPreprocessedSecond,
+        size=(intPreprocessedHeight, intPreprocessedWidth),
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    tenFlow = 20.0 * torch.nn.functional.interpolate(
+        input=netNetwork(tenPreprocessedFirst, tenPreprocessedSecond),
+        size=(intHeight, intWidth),
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    tenFlow[:, 0, :, :] *= float(intWidth) / float(intPreprocessedWidth)
+    tenFlow[:, 1, :, :] *= float(intHeight) / float(intPreprocessedHeight)
+
+    return tenFlow[0, :, :, :].cpu()
+
+
+# end
+
+##########################################################
+
+# if __name__ == '__main__':
+# 	tenFirst = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strFirst))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
+# 	tenSecond = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strSecond))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
+
+# 	tenOutput = estimate(tenFirst, tenSecond)
+
+# 	objOutput = open(arguments_strOut, 'wb')
+
+# 	numpy.array([ 80, 73, 69, 72 ], numpy.uint8).tofile(objOutput)
+# 	numpy.array([ tenOutput.shape[2], tenOutput.shape[1] ], numpy.int32).tofile(objOutput)
+# 	numpy.array(tenOutput.numpy().transpose(1, 2, 0), numpy.float32).tofile(objOutput)
+
+# 	objOutput.close()
+# end
diff --git a/eval/vae/flolpips/utils.py b/eval/vae/flolpips/utils.py
new file mode 100644
index 0000000..21a3504
--- /dev/null
+++ b/eval/vae/flolpips/utils.py
@@ -0,0 +1,107 @@
+import cv2
+import numpy as np
+import torch
+
+
+def normalize_tensor(in_feat, eps=1e-10):
+    norm_factor = torch.sqrt(torch.sum(in_feat**2, dim=1, keepdim=True))
+    return in_feat / (norm_factor + eps)
+
+
+def l2(p0, p1, range=255.0):
+    return 0.5 * np.mean((p0 / range - p1 / range) ** 2)
+
+
+def dssim(p0, p1, range=255.0):
+    from skimage.measure import compare_ssim
+
+    return (1 - compare_ssim(p0, p1, data_range=range, multichannel=True)) / 2.0
+
+
+def tensor2im(image_tensor, imtype=np.uint8, cent=1.0, factor=255.0 / 2.0):
+    image_numpy = image_tensor[0].cpu().float().numpy()
+    image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + cent) * factor
+    return image_numpy.astype(imtype)
+
+
+def tensor2np(tensor_obj):
+    # change dimension of a tensor object into a numpy array
+    return tensor_obj[0].cpu().float().numpy().transpose((1, 2, 0))
+
+
+def np2tensor(np_obj):
+    # change dimenion of np array into tensor array
+    return torch.Tensor(np_obj[:, :, :, np.newaxis].transpose((3, 2, 0, 1)))
+
+
+def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False):
+    # image tensor to lab tensor
+    from skimage import color
+
+    img = tensor2im(image_tensor)
+    img_lab = color.rgb2lab(img)
+    if mc_only:
+        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
+    if to_norm and not mc_only:
+        img_lab[:, :, 0] = img_lab[:, :, 0] - 50
+        img_lab = img_lab / 100.0
+
+    return np2tensor(img_lab)
+
+
+def read_frame_yuv2rgb(stream, width, height, iFrame, bit_depth, pix_fmt="420"):
+    if pix_fmt == "420":
+        multiplier = 1
+        uv_factor = 2
+    elif pix_fmt == "444":
+        multiplier = 2
+        uv_factor = 1
+    else:
+        print("Pixel format {} is not supported".format(pix_fmt))
+        return
+
+    if bit_depth == 8:
+        datatype = np.uint8
+        stream.seek(iFrame * 1.5 * width * height * multiplier)
+        Y = np.fromfile(stream, dtype=datatype, count=width * height).reshape((height, width))
+
+        # read chroma samples and upsample since original is 4:2:0 sampling
+        U = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
+            (height // uv_factor, width // uv_factor)
+        )
+        V = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
+            (height // uv_factor, width // uv_factor)
+        )
+
+    else:
+        datatype = np.uint16
+        stream.seek(iFrame * 3 * width * height * multiplier)
+        Y = np.fromfile(stream, dtype=datatype, count=width * height).reshape((height, width))
+
+        U = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
+            (height // uv_factor, width // uv_factor)
+        )
+        V = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
+            (height // uv_factor, width // uv_factor)
+        )
+
+    if pix_fmt == "420":
+        yuv = np.empty((height * 3 // 2, width), dtype=datatype)
+        yuv[0:height, :] = Y
+
+        yuv[height : height + height // 4, :] = U.reshape(-1, width)
+        yuv[height + height // 4 :, :] = V.reshape(-1, width)
+
+        if bit_depth != 8:
+            yuv = (yuv / (2**bit_depth - 1) * 255).astype(np.uint8)
+
+        # convert to rgb
+        rgb = cv2.cvtColor(yuv, cv2.COLOR_YUV2RGB_I420)
+
+    else:
+        yvu = np.stack([Y, V, U], axis=2)
+        if bit_depth != 8:
+            yvu = (yvu / (2**bit_depth - 1) * 255).astype(np.uint8)
+        rgb = cv2.cvtColor(yvu, cv2.COLOR_YCrCb2RGB)
+
+    return rgb
diff --git a/eval/vae/launch.sh b/eval/vae/launch.sh
new file mode 100644
index 0000000..9d71de8
--- /dev/null
+++ b/eval/vae/launch.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Reconstruct validation data with the OpenSora VAE v1.3 and compute metrics.
+#
+# Usage: launch.sh [CKPT_PATH]
+#   CKPT_PATH  checkpoint file or directory; when omitted, the model under
+#              pretrained_models/OpenSoraVAE_V1_3 is used with
+#              --force-huggingface True.
+# Environment overrides: IMG_PATH, VID_PATH, NUM_FRAMES, FORCE_HUGGINGFACE.
+CKPT_PATH=$1
+
+# Fall back to the defaults when a variable is unset or empty.
+IMG_PATH="${IMG_PATH:-/mnt/jfs-hdd/sora/meta/validation/img_1k.csv}"
+VID_PATH="${VID_PATH:-/mnt/jfs-hdd/sora/meta/validation/vid_100.csv}"
+NUM_FRAMES="${NUM_FRAMES:-17}"
+FORCE_HUGGINGFACE="${FORCE_HUGGINGFACE:-False}"
+
+if [ -z "$CKPT_PATH" ]; then # huggingface model
+    STEP_RECORD=epoch0-global_step0
+    LOG_DIR=outputs/OpenSoraVAE_V1_3/eval
+    FORCE_HUGGINGFACE=True
+    CKPT_PATH=pretrained_models/OpenSoraVAE_V1_3
+else
+    # A directory names the step itself; for a file, use its parent directory.
+    if [[ -d "$CKPT_PATH" ]]; then
+        STEP_RECORD=$(basename "$CKPT_PATH")
+    elif [[ -f "$CKPT_PATH" ]]; then
+        STEP_RECORD=$(basename "$(dirname "$CKPT_PATH")")
+    else
+        echo "$CKPT_PATH is not valid";
+        exit 1
+    fi
+    LOG_DIR=$(dirname "$CKPT_PATH")/eval
+fi
+
+SAMPLE_DIR="samples/opensoravae_v1_3/${STEP_RECORD}"
+
+echo "saving losses and metrics to $LOG_DIR"
+echo "video path: ${VID_PATH}"
+mkdir -p "$LOG_DIR"
+
+# generate video, 256x256
+torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/video_16z_256x256.py \
+    --data-path "$VID_PATH" \
+    --save-dir "${SAMPLE_DIR}/${NUM_FRAMES}x256x256" \
+    --ckpt-path "${CKPT_PATH}" \
+    --num-frames "$NUM_FRAMES" \
+    --force-huggingface "${FORCE_HUGGINGFACE}"
+# calc metrics, 17x256x256
+python eval/vae/eval_common_metric.py --batch_size 4 \
+    --real_video_dir "${SAMPLE_DIR}/${NUM_FRAMES}x256x256_ori" \
+    --generated_video_dir "${SAMPLE_DIR}/${NUM_FRAMES}x256x256_rec" \
+    --device cuda --sample_fps 24 --crop_size 256 --resolution 256 \
+    --num_frames "$NUM_FRAMES" --sample_rate 1 \
+    --metric ssim psnr lpips flolpips --type video \
+    --res_dir "${LOG_DIR}"
+
+# generate video, 512x512
+torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/video_16z_512x512.py \
+    --data-path "$VID_PATH" \
+    --save-dir "${SAMPLE_DIR}/${NUM_FRAMES}x512x512" \
+    --ckpt-path "${CKPT_PATH}" \
+    --num-frames "$NUM_FRAMES" \
+    --force-huggingface "${FORCE_HUGGINGFACE}"
+# calc metrics, 17x512x512
+python eval/vae/eval_common_metric.py --batch_size 4 \
+    --real_video_dir "${SAMPLE_DIR}/${NUM_FRAMES}x512x512_ori" \
+    --generated_video_dir "${SAMPLE_DIR}/${NUM_FRAMES}x512x512_rec" \
+    --device cuda --sample_fps 24 --crop_size 512 --resolution 512 \
+    --num_frames "$NUM_FRAMES" --sample_rate 1 \
+    --metric ssim psnr lpips flolpips --type video \
+    --res_dir "${LOG_DIR}"
+
+# generate image, 1024x1024
+torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/image_16z.py \
+    --data-path "$IMG_PATH" \
+    --save-dir "${SAMPLE_DIR}/1x1024x1024" \
+    --ckpt-path "${CKPT_PATH}" \
+    --num-frames 1 \
+    --force-huggingface "${FORCE_HUGGINGFACE}"
+# calc metrics, 1x1024x1024
+python eval/vae/eval_common_metric.py --batch_size 4 \
+    --real_video_dir "${SAMPLE_DIR}/1x1024x1024_ori" \
+    --generated_video_dir "${SAMPLE_DIR}/1x1024x1024_rec" \
+    --device cuda --sample_fps 1 --crop_size 1024 --resolution 1024 \
+    --num_frames 1 --sample_rate 1 \
+    --metric ssim psnr lpips --type image \
+    --res_dir "${LOG_DIR}"
diff --git a/eval/vae/script/eval.sh b/eval/vae/script/eval.sh
new file mode 100644
index 0000000..3716719
--- /dev/null
+++ b/eval/vae/script/eval.sh
@@ -0,0 +1,12 @@
+python eval/eval_common_metric.py \
+    --batch_size 2 \
+    --real_video_dir ../test_eval/release/origin \
+    --generated_video_dir ../test_eval/release \
+    --device cuda \
+    --sample_fps 10 \
+    --crop_size 256 \
+    --resolution 256 \
+    --num_frames 17 \
+    --sample_rate 1 \
+    --subset_size 100 \
+    --metric ssim psnr lpips flolpips
diff --git a/eval/vbench/VBench_full_info.json b/eval/vbench/VBench_full_info.json
new file mode 100644
index 0000000..e60c40e
--- /dev/null
+++ b/eval/vbench/VBench_full_info.json
@@ -0,0 +1,9132 @@
+[
+    {
+        "prompt_en": "In a still frame, a stop sign",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "a toilet, frozen in time",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "a laptop, frozen in time",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of alley",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of bar",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of barn",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of bathroom",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of bedroom",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of cliff",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, courtyard",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, gas station",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of house",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "indoor gymnasium, frozen in time",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of indoor library",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of kitchen",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of palace",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, parking lot",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, phone booth",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of restaurant",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of tower",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a bowl",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of an apple",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a bench",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a bed",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a chair",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a cup",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a dining table",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, a pear",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a bunch of grapes",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a bowl on the kitchen counter",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of an antique bowl",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of an exquisite mahogany dining table",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a wooden bench in the park",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, a park bench with a view of the lake",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved fa\u00e7ades",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time",
+        "dimension": [
+            "temporal_flickering"
+        ]
+    },
+    {
+        "prompt_en": "a bird and a cat",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bird and cat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cat and a dog",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "cat and dog"
+            }
+        }
+    },
+    {
+        "prompt_en": "a dog and a horse",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "dog and horse"
+            }
+        }
+    },
+    {
+        "prompt_en": "a horse and a sheep",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "horse and sheep"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sheep and a cow",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "sheep and cow"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cow and an elephant",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "cow and elephant"
+            }
+        }
+    },
+    {
+        "prompt_en": "an elephant and a bear",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "elephant and bear"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bear and a zebra",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bear and zebra"
+            }
+        }
+    },
+    {
+        "prompt_en": "a zebra and a giraffe",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "zebra and giraffe"
+            }
+        }
+    },
+    {
+        "prompt_en": "a giraffe and a bird",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "giraffe and bird"
+            }
+        }
+    },
+    {
+        "prompt_en": "a chair and a couch",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "chair and couch"
+            }
+        }
+    },
+    {
+        "prompt_en": "a couch and a potted plant",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "couch and potted plant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a potted plant and a tv",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "potted plant and tv"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tv and a laptop",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "tv and laptop"
+            }
+        }
+    },
+    {
+        "prompt_en": "a laptop and a remote",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "laptop and remote"
+            }
+        }
+    },
+    {
+        "prompt_en": "a remote and a keyboard",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "remote and keyboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a keyboard and a cell phone",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "keyboard and cell phone"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cell phone and a book",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "cell phone and book"
+            }
+        }
+    },
+    {
+        "prompt_en": "a book and a clock",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "book and clock"
+            }
+        }
+    },
+    {
+        "prompt_en": "a clock and a backpack",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "clock and backpack"
+            }
+        }
+    },
+    {
+        "prompt_en": "a backpack and an umbrella",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "backpack and umbrella"
+            }
+        }
+    },
+    {
+        "prompt_en": "an umbrella and a handbag",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "umbrella and handbag"
+            }
+        }
+    },
+    {
+        "prompt_en": "a handbag and a tie",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "handbag and tie"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tie and a suitcase",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "tie and suitcase"
+            }
+        }
+    },
+    {
+        "prompt_en": "a suitcase and a vase",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "suitcase and vase"
+            }
+        }
+    },
+    {
+        "prompt_en": "a vase and scissors",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "vase and scissors"
+            }
+        }
+    },
+    {
+        "prompt_en": "scissors and a teddy bear",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "scissors and teddy bear"
+            }
+        }
+    },
+    {
+        "prompt_en": "a teddy bear and a frisbee",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "teddy bear and frisbee"
+            }
+        }
+    },
+    {
+        "prompt_en": "a frisbee and skis",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "frisbee and skis"
+            }
+        }
+    },
+    {
+        "prompt_en": "skis and a snowboard",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "skis and snowboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a snowboard and a sports ball",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "snowboard and sports ball"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sports ball and a kite",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "sports ball and kite"
+            }
+        }
+    },
+    {
+        "prompt_en": "a kite and a baseball bat",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "kite and baseball bat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball bat and a baseball glove",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "baseball bat and baseball glove"
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball glove and a skateboard",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "baseball glove and skateboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a skateboard and a surfboard",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "skateboard and surfboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a surfboard and a tennis racket",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "surfboard and tennis racket"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tennis racket and a bottle",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "tennis racket and bottle"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bottle and a chair",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bottle and chair"
+            }
+        }
+    },
+    {
+        "prompt_en": "an airplane and a train",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "airplane and train"
+            }
+        }
+    },
+    {
+        "prompt_en": "a train and a boat",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "train and boat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a boat and an airplane",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "boat and airplane"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bicycle and a car",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bicycle and car"
+            }
+        }
+    },
+    {
+        "prompt_en": "a car and a motorcycle",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "car and motorcycle"
+            }
+        }
+    },
+    {
+        "prompt_en": "a motorcycle and a bus",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "motorcycle and bus"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bus and a traffic light",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bus and traffic light"
+            }
+        }
+    },
+    {
+        "prompt_en": "a traffic light and a fire hydrant",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "traffic light and fire hydrant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a fire hydrant and a stop sign",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "fire hydrant and stop sign"
+            }
+        }
+    },
+    {
+        "prompt_en": "a stop sign and a parking meter",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "stop sign and parking meter"
+            }
+        }
+    },
+    {
+        "prompt_en": "a parking meter and a truck",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "parking meter and truck"
+            }
+        }
+    },
+    {
+        "prompt_en": "a truck and a bicycle",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "truck and bicycle"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toilet and a hair drier",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "toilet and hair drier"
+            }
+        }
+    },
+    {
+        "prompt_en": "a hair drier and a toothbrush",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "hair drier and toothbrush"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toothbrush and a sink",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "toothbrush and sink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sink and a toilet",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "sink and toilet"
+            }
+        }
+    },
+    {
+        "prompt_en": "a wine glass and a chair",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "wine glass and chair"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cup and a couch",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "cup and couch"
+            }
+        }
+    },
+    {
+        "prompt_en": "a fork and a potted plant",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "fork and potted plant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a knife and a tv",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "knife and tv"
+            }
+        }
+    },
+    {
+        "prompt_en": "a spoon and a laptop",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "spoon and laptop"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bowl and a remote",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bowl and remote"
+            }
+        }
+    },
+    {
+        "prompt_en": "a banana and a keyboard",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "banana and keyboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "an apple and a cell phone",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "apple and cell phone"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sandwich and a book",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "sandwich and book"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange and a clock",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "orange and clock"
+            }
+        }
+    },
+    {
+        "prompt_en": "broccoli and a backpack",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "broccoli and backpack"
+            }
+        }
+    },
+    {
+        "prompt_en": "a carrot and an umbrella",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "carrot and umbrella"
+            }
+        }
+    },
+    {
+        "prompt_en": "a hot dog and a handbag",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "hot dog and handbag"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pizza and a tie",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "pizza and tie"
+            }
+        }
+    },
+    {
+        "prompt_en": "a donut and a suitcase",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "donut and suitcase"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cake and a vase",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "cake and vase"
+            }
+        }
+    },
+    {
+        "prompt_en": "an oven and scissors",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "oven and scissors"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toaster and a teddy bear",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "toaster and teddy bear"
+            }
+        }
+    },
+    {
+        "prompt_en": "a microwave and a frisbee",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "microwave and frisbee"
+            }
+        }
+    },
+    {
+        "prompt_en": "a refrigerator and skis",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "refrigerator and skis"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bicycle and an airplane",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "bicycle and airplane"
+            }
+        }
+    },
+    {
+        "prompt_en": "a car and a train",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "car and train"
+            }
+        }
+    },
+    {
+        "prompt_en": "a motorcycle and a boat",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "motorcycle and boat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a person and a toilet",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "person and toilet"
+            }
+        }
+    },
+    {
+        "prompt_en": "a person and a hair drier",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "person and hair drier"
+            }
+        }
+    },
+    {
+        "prompt_en": "a person and a toothbrush",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "person and toothbrush"
+            }
+        }
+    },
+    {
+        "prompt_en": "a person and a sink",
+        "dimension": [
+            "multiple_objects"
+        ],
+        "auxiliary_info": {
+            "multiple_objects": {
+                "object": "person and sink"
+            }
+        }
+    },
+    {
+        "prompt_en": "A person is riding a bike",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is marching",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is roller skating",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is tasting beer",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is clapping",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is drawing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is petting animal (not cat)",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is eating watermelon",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing harp",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is wrestling",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is riding scooter",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is sweeping floor",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is skateboarding",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is dunking basketball",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing flute",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is stretching leg",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is tying tie",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is skydiving",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shooting goal (soccer)",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing piano",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is finger snapping",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is canoeing or kayaking",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is laughing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is digging",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is clay pottery making",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shooting basketball",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is bending back",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shaking hands",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is bandaging",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is push up",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is catching or throwing frisbee",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing trumpet",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is flying kite",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is filling eyebrows",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shuffling cards",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is folding clothes",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is smoking",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is tai chi",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is squat",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing controller",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is throwing axe",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is giving or receiving award",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is air drumming",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is taking a shower",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is planting trees",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is sharpening knives",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is robot dancing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is rock climbing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is hula hooping",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is writing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is bungee jumping",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is pushing cart",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is cleaning windows",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is cutting watermelon",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is cheerleading",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is washing hands",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is ironing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is cutting nails",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is hugging",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is trimming or shaving beard",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is jogging",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is making bed",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is washing dishes",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is grooming dog",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is doing laundry",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is knitting",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is reading book",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is baby waking up",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is massaging legs",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is brushing teeth",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is crawling baby",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is motorcycling",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is driving car",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is sticking tongue out",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shaking head",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is sword fighting",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is doing aerobics",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is strumming guitar",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is riding or walking with horse",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is archery",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is catching or throwing baseball",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is playing chess",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is rock scissors paper",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is using computer",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is arranging flowers",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is bending metal",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is ice skating",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is climbing a rope",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is crying",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is dancing ballet",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is getting a haircut",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is running on treadmill",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is kissing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is counting money",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is barbequing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is peeling apples",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is milking cow",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is shining shoes",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is making snowman",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "A person is sailing",
+        "dimension": [
+            "human_action"
+        ]
+    },
+    {
+        "prompt_en": "a person swimming in ocean",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person giving a presentation to a room full of colleagues",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person washing the dishes",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person eating a burger",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person walking in the snowstorm",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person drinking coffee in a cafe",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person playing guitar",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bicycle leaning against a tree",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bicycle gliding through a snowy field",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bicycle slowing down to stop",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bicycle accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a car stuck in traffic during rush hour",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a car turning a corner",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a car slowing down to stop",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a car accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a motorcycle cruising along a coastal highway",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a motorcycle turning a corner",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a motorcycle slowing down to stop",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a motorcycle gliding through a snowy field",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a motorcycle accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an airplane soaring through a clear blue sky",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an airplane taking off",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an airplane landing smoothly on a runway",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an airplane accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bus turning a corner",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bus stuck in traffic during rush hour",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bus accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a train speeding down the tracks",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a train crossing over a tall bridge",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a train accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a truck turning a corner",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a truck anchored in a tranquil bay",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a truck stuck in traffic during rush hour",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a truck slowing down to stop",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a truck accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a boat sailing smoothly on a calm lake",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a boat slowing down to stop",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a boat accelerating to gain speed",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bird soaring gracefully in the sky",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bird building a nest from twigs and leaves",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bird flying over a snowy forest",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cat grooming itself meticulously with its tongue",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cat playing in park",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cat drinking water",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cat running happily",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a dog enjoying a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a dog playing in park",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a dog drinking water",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a dog running happily",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a horse bending down to drink water from a river",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a horse galloping across an open field",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a horse taking a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a horse running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a sheep bending down to drink water from a river",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a sheep taking a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a sheep running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cow bending down to drink water from a river",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cow chewing cud while resting in a tranquil barn",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a cow running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an elephant spraying itself with water using its trunk to cool down",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an elephant taking a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "an elephant running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bear catching a salmon in its powerful jaws",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bear sniffing the air for scents of food",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bear climbing a tree",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a bear hunting for prey",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a zebra bending down to drink water from a river",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a zebra running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a zebra taking a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a giraffe bending down to drink water from a river",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a giraffe taking a peaceful walk",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a giraffe running to join a herd of its kind",
+        "dimension": [
+            "subject_consistency",
+            "dynamic_degree",
+            "motion_smoothness"
+        ]
+    },
+    {
+        "prompt_en": "a person",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "person"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bicycle",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bicycle"
+            }
+        }
+    },
+    {
+        "prompt_en": "a car",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "car"
+            }
+        }
+    },
+    {
+        "prompt_en": "a motorcycle",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "motorcycle"
+            }
+        }
+    },
+    {
+        "prompt_en": "an airplane",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "airplane"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bus",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bus"
+            }
+        }
+    },
+    {
+        "prompt_en": "a train",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "train"
+            }
+        }
+    },
+    {
+        "prompt_en": "a truck",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "truck"
+            }
+        }
+    },
+    {
+        "prompt_en": "a boat",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "boat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a traffic light",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "traffic light"
+            }
+        }
+    },
+    {
+        "prompt_en": "a fire hydrant",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "fire hydrant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a stop sign",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "stop sign"
+            }
+        }
+    },
+    {
+        "prompt_en": "a parking meter",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "parking meter"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bench",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bench"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bird",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bird"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cat",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "cat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a dog",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "dog"
+            }
+        }
+    },
+    {
+        "prompt_en": "a horse",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "horse"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sheep",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "sheep"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cow",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "cow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an elephant",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "elephant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bear",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bear"
+            }
+        }
+    },
+    {
+        "prompt_en": "a zebra",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "zebra"
+            }
+        }
+    },
+    {
+        "prompt_en": "a giraffe",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "giraffe"
+            }
+        }
+    },
+    {
+        "prompt_en": "a backpack",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "backpack"
+            }
+        }
+    },
+    {
+        "prompt_en": "an umbrella",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "umbrella"
+            }
+        }
+    },
+    {
+        "prompt_en": "a handbag",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "handbag"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tie",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "tie"
+            }
+        }
+    },
+    {
+        "prompt_en": "a suitcase",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "suitcase"
+            }
+        }
+    },
+    {
+        "prompt_en": "a frisbee",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "frisbee"
+            }
+        }
+    },
+    {
+        "prompt_en": "skis",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "skis"
+            }
+        }
+    },
+    {
+        "prompt_en": "a snowboard",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "snowboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sports ball",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "sports ball"
+            }
+        }
+    },
+    {
+        "prompt_en": "a kite",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "kite"
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball bat",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "baseball bat"
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball glove",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "baseball glove"
+            }
+        }
+    },
+    {
+        "prompt_en": "a skateboard",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "skateboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a surfboard",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "surfboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tennis racket",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "tennis racket"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bottle",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bottle"
+            }
+        }
+    },
+    {
+        "prompt_en": "a wine glass",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "wine glass"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cup",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "cup"
+            }
+        }
+    },
+    {
+        "prompt_en": "a fork",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "fork"
+            }
+        }
+    },
+    {
+        "prompt_en": "a knife",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "knife"
+            }
+        }
+    },
+    {
+        "prompt_en": "a spoon",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "spoon"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bowl",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bowl"
+            }
+        }
+    },
+    {
+        "prompt_en": "a banana",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "banana"
+            }
+        }
+    },
+    {
+        "prompt_en": "an apple",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "apple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sandwich",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "sandwich"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "broccoli",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "broccoli"
+            }
+        }
+    },
+    {
+        "prompt_en": "a carrot",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "carrot"
+            }
+        }
+    },
+    {
+        "prompt_en": "a hot dog",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "hot dog"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pizza",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "pizza"
+            }
+        }
+    },
+    {
+        "prompt_en": "a donut",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "donut"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cake",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "cake"
+            }
+        }
+    },
+    {
+        "prompt_en": "a chair",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "chair"
+            }
+        }
+    },
+    {
+        "prompt_en": "a couch",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "couch"
+            }
+        }
+    },
+    {
+        "prompt_en": "a potted plant",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "potted plant"
+            }
+        }
+    },
+    {
+        "prompt_en": "a bed",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "bed"
+            }
+        }
+    },
+    {
+        "prompt_en": "a dining table",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "dining table"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toilet",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "toilet"
+            }
+        }
+    },
+    {
+        "prompt_en": "a tv",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "tv"
+            }
+        }
+    },
+    {
+        "prompt_en": "a laptop",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "laptop"
+            }
+        }
+    },
+    {
+        "prompt_en": "a remote",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "remote"
+            }
+        }
+    },
+    {
+        "prompt_en": "a keyboard",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "keyboard"
+            }
+        }
+    },
+    {
+        "prompt_en": "a cell phone",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "cell phone"
+            }
+        }
+    },
+    {
+        "prompt_en": "a microwave",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "microwave"
+            }
+        }
+    },
+    {
+        "prompt_en": "an oven",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "oven"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toaster",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "toaster"
+            }
+        }
+    },
+    {
+        "prompt_en": "a sink",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "sink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a refrigerator",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "refrigerator"
+            }
+        }
+    },
+    {
+        "prompt_en": "a book",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "book"
+            }
+        }
+    },
+    {
+        "prompt_en": "a clock",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "clock"
+            }
+        }
+    },
+    {
+        "prompt_en": "a vase",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "vase"
+            }
+        }
+    },
+    {
+        "prompt_en": "scissors",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "scissors"
+            }
+        }
+    },
+    {
+        "prompt_en": "a teddy bear",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "teddy bear"
+            }
+        }
+    },
+    {
+        "prompt_en": "a hair drier",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "hair drier"
+            }
+        }
+    },
+    {
+        "prompt_en": "a toothbrush",
+        "dimension": [
+            "object_class"
+        ],
+        "auxiliary_info": {
+            "object_class": {
+                "object": "toothbrush"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white bicycle",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white car",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white bird",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black cat",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white cat",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange cat",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow cat",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white umbrella",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white suitcase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white bowl",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white chair",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white clock",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a red vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "red"
+            }
+        }
+    },
+    {
+        "prompt_en": "a green vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "green"
+            }
+        }
+    },
+    {
+        "prompt_en": "a blue vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "blue"
+            }
+        }
+    },
+    {
+        "prompt_en": "a yellow vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "yellow"
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "orange"
+            }
+        }
+    },
+    {
+        "prompt_en": "a purple vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "purple"
+            }
+        }
+    },
+    {
+        "prompt_en": "a pink vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "pink"
+            }
+        }
+    },
+    {
+        "prompt_en": "a black vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "black"
+            }
+        }
+    },
+    {
+        "prompt_en": "a white vase",
+        "dimension": [
+            "color"
+        ],
+        "auxiliary_info": {
+            "color": {
+                "color": "white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "The bund Shanghai, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "An astronaut flying in space, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "Van Gogh style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "oil painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "by Hokusai, in the style of Ukiyo"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "black and white"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "pixel art"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "in cyberpunk style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "animated style"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "watercolor painting"
+            }
+        }
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style",
+        "dimension": [
+            "appearance_style"
+        ],
+        "auxiliary_info": {
+            "appearance_style": {
+                "appearance_style": "surrealism style"
+            }
+        }
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus",
+        "dimension": [
+            "temporal_style"
+        ]
+    },
+    {
+        "prompt_en": "Close up of grapes on a rotating table.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Turtle swimming in ocean.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A storm trooper vacuuming the beach.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A panda standing on a surfboard in the ocean in sunset.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Two pandas discussing an academic paper.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A koala bear playing piano in the forest.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut flying in space.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Fireworks.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An animated painting of fluffy white clouds moving in sky.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Flying through fantasy landscapes.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A bigfoot walking in the snowstorm.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A squirrel eating a burger.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "an ice cream is melting on the table.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "a drone flying over a snowy forest.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "a shark is swimming in the ocean.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Aerial panoramic video from a drone of a fantasy land.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "a teddy bear is swimming in the ocean.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "time lapse of sunrise on mars.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "golden fish swimming in the ocean.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An artist brush painting on a canvas close up.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Campfire at night in a snowy forest with starry sky in the background.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "a fantasy landscape",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A 3D model of a 1800s victorian house.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "this is how I do makeup in the morning.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A raccoon that looks like a turtle, digital art.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Robot dancing in Times Square.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Busy freeway at night.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Balloon full of water exploding in extreme slow motion.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An astronaut is riding a horse in the space in a photorealistic style.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Sewing machine, old sewing machine working.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Vampire makeup face of beautiful girl, red contact lenses.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Pacific coast, carmel by the sea ocean and waves.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A teddy bear is playing drum kit in NYC Times Square.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A corgi is playing drum kit.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A raccoon is playing the electronic guitar.",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A corgi's head depicted as an explosion of a nebula",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A fantasy landscape",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A future where humans have achieved teleportation technology",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A Mars rover moving on Mars",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A panda drinking coffee in a cafe in Paris",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A steam train moving on a mountainside",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A super cool giant robot in Cyberpunk Beijing",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Gwen Stacy reading a book",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Iron Man flying in the sky",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, oil painting",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Yoda playing guitar on the stage",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A car moving slowly on an empty street, rainy evening",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cat eating food out of a bowl",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cat wearing sunglasses at a pool",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A confused panda in calculus class",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cute fluffy panda eating Chinese food in a restaurant",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cute happy Corgi playing in park, sunset",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A cute raccoon playing guitar in a boat on the ocean",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A modern art museum, with colorful paintings",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A panda cooking in the kitchen",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A panda playing on a swing set",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A polar bear is playing guitar",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A raccoon dressed in suit playing the trumpet, stage background",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A shark swimming in clear Caribbean ocean",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A super robot protecting city",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "A teddy bear washing the dishes",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Clown fish swimming through the coral reef",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Hyper-realistic spaceship landing on Mars",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "The bund Shanghai, vibrant color",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Vincent van Gogh is painting in the room",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "Yellow flowers swing in the wind",
+        "dimension": [
+            "overall_consistency",
+            "aesthetic_quality",
+            "imaging_quality"
+        ]
+    },
+    {
+        "prompt_en": "alley",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "alley"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "amusement park",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "amusement park"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "aquarium",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "aquarium"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "arch",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "arch"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "art gallery",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "art gallery"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "bathroom",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "bathroom"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "bakery shop",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "bakery shop"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "ballroom",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "ballroom"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "bar",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "bar"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "barn",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "barn"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "basement",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "basement"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "beach",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "beach"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "bedroom",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "bedroom"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "bridge",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "bridge"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "botanical garden",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "botanical garden"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "cafeteria",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "cafeteria"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "campsite",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "campsite"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "campus",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "campus"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "carrousel",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "carrousel"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "castle",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "castle"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "cemetery",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "cemetery"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "classroom",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "classroom"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "cliff",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "cliff"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "crosswalk",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "crosswalk"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "construction site",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "construction site"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "corridor",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "corridor"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "courtyard",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "courtyard"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "desert",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "desert"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "downtown",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "downtown"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "driveway",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "driveway"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "farm",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "farm"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "food court",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "food court"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "football field",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "football field"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "forest road",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "forest road"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "fountain",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "fountain"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "gas station",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "gas station"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "glacier",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "glacier"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "golf course",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "golf course"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "indoor gymnasium",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "indoor gymnasium"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "harbor",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "harbor"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "highway",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "highway"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "hospital",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "hospital"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "house",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "house"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "iceberg",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "iceberg"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "industrial area",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "industrial area"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "jail cell",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "jail cell"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "junkyard",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "junkyard"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "kitchen",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "kitchen"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "indoor library",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "indoor library"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "lighthouse",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "lighthouse"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "laboratory",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "laboratory"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "mansion",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "mansion"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "marsh",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "marsh"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "mountain",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "mountain"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "indoor movie theater",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "indoor movie theater"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "indoor museum",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "indoor museum"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "music studio",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "music studio"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "nursery",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "nursery"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "ocean",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "ocean"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "office",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "office"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "palace",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "palace"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "parking lot",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "parking lot"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "pharmacy",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "pharmacy"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "phone booth",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "phone booth"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "raceway",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "raceway"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "restaurant",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "restaurant"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "river",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "river"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "science museum",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "science museum"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "shower",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "shower"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "ski slope",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "ski slope"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "sky",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "sky"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "skyscraper",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "skyscraper"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "baseball stadium",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "baseball stadium"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "staircase",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "staircase"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "street",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "street"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "supermarket",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "supermarket"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "indoor swimming pool",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "indoor swimming pool"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "tower",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "tower"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "outdoor track",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "outdoor track"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "train railway",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "train railway"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "train station platform",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "train station platform"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "underwater coral reef",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "underwater coral reef"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "valley",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "valley"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "volcano",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "volcano"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "waterfall",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "waterfall"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "windmill",
+        "dimension": [
+            "scene",
+            "background_consistency"
+        ],
+        "auxiliary_info": {
+            "scene": {
+                "scene": {
+                    "scene": "windmill"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bicycle on the left of a car, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bicycle",
+                    "object_b": "car",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a car on the right of a motorcycle, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "car",
+                    "object_b": "motorcycle",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a motorcycle on the left of a bus, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "motorcycle",
+                    "object_b": "bus",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bus on the right of a traffic light, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bus",
+                    "object_b": "traffic light",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a traffic light on the left of a fire hydrant, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "traffic light",
+                    "object_b": "fire hydrant",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a fire hydrant on the right of a stop sign, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "fire hydrant",
+                    "object_b": "stop sign",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a stop sign on the left of a parking meter, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "stop sign",
+                    "object_b": "parking meter",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a parking meter on the right of a bench, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "parking meter",
+                    "object_b": "bench",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bench on the left of a truck, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bench",
+                    "object_b": "truck",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a truck on the right of a bicycle, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "truck",
+                    "object_b": "bicycle",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bird on the left of a cat, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bird",
+                    "object_b": "cat",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a cat on the right of a dog, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "cat",
+                    "object_b": "dog",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a dog on the left of a horse, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "dog",
+                    "object_b": "horse",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a horse on the right of a sheep, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "horse",
+                    "object_b": "sheep",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a sheep on the left of a cow, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "sheep",
+                    "object_b": "cow",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a cow on the right of an elephant, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "cow",
+                    "object_b": "elephant",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an elephant on the left of a bear, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "elephant",
+                    "object_b": "bear",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bear on the right of a zebra, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bear",
+                    "object_b": "zebra",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a zebra on the left of a giraffe, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "zebra",
+                    "object_b": "giraffe",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a giraffe on the right of a bird, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "giraffe",
+                    "object_b": "bird",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bottle on the left of a wine glass, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bottle",
+                    "object_b": "wine glass",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a wine glass on the right of a cup, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "wine glass",
+                    "object_b": "cup",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a cup on the left of a fork, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "cup",
+                    "object_b": "fork",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a fork on the right of a knife, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "fork",
+                    "object_b": "knife",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a knife on the left of a spoon, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "knife",
+                    "object_b": "spoon",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a spoon on the right of a bowl, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "spoon",
+                    "object_b": "bowl",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bowl on the left of a bottle, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bowl",
+                    "object_b": "bottle",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a potted plant on the left of a remote, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "potted plant",
+                    "object_b": "remote",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a remote on the right of a clock, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "remote",
+                    "object_b": "clock",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a clock on the left of a vase, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "clock",
+                    "object_b": "vase",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a vase on the right of scissors, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "vase",
+                    "object_b": "scissors",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "scissors on the left of a teddy bear, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "scissors",
+                    "object_b": "teddy bear",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a teddy bear on the right of a potted plant, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "teddy bear",
+                    "object_b": "potted plant",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a frisbee on the left of a sports ball, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "frisbee",
+                    "object_b": "sports ball",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a sports ball on the right of a baseball bat, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "sports ball",
+                    "object_b": "baseball bat",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball bat on the left of a baseball glove, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "baseball bat",
+                    "object_b": "baseball glove",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a baseball glove on the right of a tennis racket, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "baseball glove",
+                    "object_b": "tennis racket",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a tennis racket on the left of a frisbee, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "tennis racket",
+                    "object_b": "frisbee",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a toilet on the left of a hair drier, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "toilet",
+                    "object_b": "hair drier",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a hair drier on the right of a toothbrush, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "hair drier",
+                    "object_b": "toothbrush",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a toothbrush on the left of a sink, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "toothbrush",
+                    "object_b": "sink",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a sink on the right of a toilet, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "sink",
+                    "object_b": "toilet",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a chair on the left of a couch, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "chair",
+                    "object_b": "couch",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a couch on the right of a bed, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "couch",
+                    "object_b": "bed",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a bed on the left of a tv, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "bed",
+                    "object_b": "tv",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a tv on the right of a dining table, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "tv",
+                    "object_b": "dining table",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a dining table on the left of a chair, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "dining table",
+                    "object_b": "chair",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an airplane on the left of a train, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "airplane",
+                    "object_b": "train",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a train on the right of a boat, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "train",
+                    "object_b": "boat",
+                    "relationship": "on the right of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a boat on the left of an airplane, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "boat",
+                    "object_b": "airplane",
+                    "relationship": "on the left of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an oven on the top of a toaster, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "oven",
+                    "object_b": "toaster",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an oven on the bottom of a toaster, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "oven",
+                    "object_b": "toaster",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a toaster on the top of a microwave, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "toaster",
+                    "object_b": "microwave",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a toaster on the bottom of a microwave, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "toaster",
+                    "object_b": "microwave",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a microwave on the top of an oven, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "microwave",
+                    "object_b": "oven",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a microwave on the bottom of an oven, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "microwave",
+                    "object_b": "oven",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a banana on the top of an apple, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "banana",
+                    "object_b": "apple",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a banana on the bottom of an apple, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "banana",
+                    "object_b": "apple",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an apple on the top of a sandwich, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "apple",
+                    "object_b": "sandwich",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an apple on the bottom of a sandwich, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "apple",
+                    "object_b": "sandwich",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a sandwich on the top of an orange, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "sandwich",
+                    "object_b": "orange",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a sandwich on the bottom of an orange, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "sandwich",
+                    "object_b": "orange",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange on the top of a carrot, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "orange",
+                    "object_b": "carrot",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "an orange on the bottom of a carrot, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "orange",
+                    "object_b": "carrot",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a carrot on the top of a hot dog, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "carrot",
+                    "object_b": "hot dog",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a carrot on the bottom of a hot dog, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "carrot",
+                    "object_b": "hot dog",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a hot dog on the top of a pizza, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "hot dog",
+                    "object_b": "pizza",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a hot dog on the bottom of a pizza, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "hot dog",
+                    "object_b": "pizza",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a pizza on the top of a donut, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "pizza",
+                    "object_b": "donut",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a pizza on the bottom of a donut, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "pizza",
+                    "object_b": "donut",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a donut on the top of broccoli, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "donut",
+                    "object_b": "broccoli",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a donut on the bottom of broccoli, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "donut",
+                    "object_b": "broccoli",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "broccoli on the top of a banana, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "broccoli",
+                    "object_b": "banana",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "broccoli on the bottom of a banana, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "broccoli",
+                    "object_b": "banana",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "skis on the top of a snowboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "skis",
+                    "object_b": "snowboard",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "skis on the bottom of a snowboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "skis",
+                    "object_b": "snowboard",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a snowboard on the top of a kite, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "snowboard",
+                    "object_b": "kite",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a snowboard on the bottom of a kite, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "snowboard",
+                    "object_b": "kite",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a kite on the top of a skateboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "kite",
+                    "object_b": "skateboard",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a kite on the bottom of a skateboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "kite",
+                    "object_b": "skateboard",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a skateboard on the top of a surfboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "skateboard",
+                    "object_b": "surfboard",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a skateboard on the bottom of a surfboard, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "skateboard",
+                    "object_b": "surfboard",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a surfboard on the top of skis, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "surfboard",
+                    "object_b": "skis",
+                    "relationship": "on the top of"
+                }
+            }
+        }
+    },
+    {
+        "prompt_en": "a surfboard on the bottom of skis, front view",
+        "dimension": [
+            "spatial_relationship"
+        ],
+        "auxiliary_info": {
+            "spatial_relationship": {
+                "spatial_relationship": {
+                    "object_a": "surfboard",
+                    "object_b": "skis",
+                    "relationship": "on the bottom of"
+                }
+            }
+        }
+    }
+]
diff --git a/eval/vbench/calc_vbench.py b/eval/vbench/calc_vbench.py
new file mode 100644
index 0000000..e5570a1
--- /dev/null
+++ b/eval/vbench/calc_vbench.py
@@ -0,0 +1,75 @@
+import argparse
+import os
+import time
+
+import torch
+
+from vbench import VBench
+
+full_info_path = "eval/vbench/VBench_full_info.json"  # passed to VBench(...) in the __main__ section
+dimensions = [  # order matters: --start/--end slice this list (per-GPU ranges live in launch_calc.sh)
+    # a: 10min  (--start 0 --end 2)
+    "subject_consistency",  # 4min
+    "imaging_quality",  # 6min
+    # b: 12min  (--start 2 --end 6)
+    "background_consistency",  # 2min
+    "motion_smoothness",  # 5min
+    "overall_consistency",  # 2min
+    "human_action",  # 3min
+    # c: 14min  (--start 6 --end 7)
+    "multiple_objects",  # 14min
+    # d: 14min  (--start 7 --end 8)
+    "spatial_relationship",  # 14min
+    # e: 12min  (--start 8 --end 9)
+    "object_class",  # 12min
+    # f: 12min  (--start 9 --end 10)
+    "color",  # 12min
+    # g: 10.5min  (--start 10 --end 13)
+    "aesthetic_quality",  # 2.5min
+    "appearance_style",  # 6min
+    "temporal_flickering",  # 2min
+    # h: 9min  (--start 13 --end 16)
+    "scene",  # 3min
+    "temporal_style",  # 2min
+    "dynamic_degree",  # 4min
+]
+
+
+def parse_args():
+    """Parse the CLI: two positional paths plus the [start, end) slice of `dimensions` to evaluate."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("video_folder", type=str)  # samples/samples..._vbench/eval
+    parser.add_argument("model_ckpt", type=str)  # a "vbench" output directory is created under this path
+    parser.add_argument("--start", type=int, default=0)  # index of the first dimension to be evaluated
+    parser.add_argument("--end", type=int, default=-1)  # exclusive end index; -1 means through the last dimension
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Evaluate the requested slice of `dimensions`, one VBench.evaluate() call per dimension.
+    args = parse_args()
+    video_path = args.video_folder
+    output_dir = os.path.join(args.model_ckpt, "vbench")
+    os.makedirs(output_dir, exist_ok=True)
+
+    # use VBench/evaluate.py default
+    kwargs = {"imaging_quality_preprocessing_mode": "longer"}
+
+    start_time = time.time()
+
+    # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module
+    evaluator = VBench(torch.device("cuda"), full_info_path, output_dir)
+    end = len(dimensions) if args.end == -1 else args.end  # -1 means "through the last dimension"
+    for dim in dimensions[args.start : end]:
+        evaluator.evaluate(
+            videos_path=video_path,
+            name=dim,
+            local=False,
+            read_frame=False,
+            dimension_list=[dim],
+            mode="vbench_standard",
+            **kwargs,
+        )
+
+    print("Runtime: %s seconds " % (time.time() - start_time))
diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
new file mode 100644
index 0000000..fa0aba0
--- /dev/null
+++ b/eval/vbench/launch.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Usage (run where eval/ is reachable): bash eval/vbench/launch.sh CKPT NUM_FRAMES MODEL_NAME [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]
+CKPT=$1
+NUM_FRAMES=$2
+MODEL_NAME=$3
+RES=$4
+ASP_RATIO=$5
+
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
+if [[ $CKPT == *"ema"* ]]; then  # NOTE: CKPT_BASE is computed here but not referenced again in this script
+    parentdir=$(dirname $CKPT)
+    CKPT_BASE=$(basename $parentdir)_ema
+else
+    CKPT_BASE=$(basename $CKPT)
+fi
+# LOG_BASE=$(dirname $CKPT)/eval
+LOG_BASE=./sample/eval
+mkdir -p "${LOG_BASE}"  # the per-task log redirections below fail if this directory is missing
+echo "Logging to $LOG_BASE"
+
+GPUS=(0 1 2 3 4 5 6 7)
+TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
+START_INDEX_LIST=(0 120 240 360 480 600 720 840)
+END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
+
+## Modify the following to run on multiple machines for faster results
+## 720p will take quite long on a single machine
+# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
+# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
+# LOG_BASE=$(dirname $CKPT)/eval/last_60
+# mkdir -p ${LOG_BASE}
+# echo "Logging to $LOG_BASE"
+
+# One background eval/sample.sh job per GPU; job i receives START_INDEX_LIST[i] and END_INDEX_LIST[i].
+# Optional trailing arguments are forwarded only up to the first one that is empty.
+for i in "${!GPUS[@]}"; do
+    if [ -z "${RES}" ] || [ -z "${ASP_RATIO}" ]  ;
+        then
+            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+        else
+            if [ -z "${NUM_SAMPLING_STEPS}" ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z "${FLOW}" ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z "${LLM_REFINE}" ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
+    fi
+done
diff --git a/eval/vbench/launch_calc.sh b/eval/vbench/launch_calc.sh
new file mode 100644
index 0000000..9f14ce5
--- /dev/null
+++ b/eval/vbench/launch_calc.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Usage: bash eval/vbench/launch_calc.sh VIDEO_DIR CKPT_DIR  (starts one background calc_vbench.py job per GPU)
+VIDEO_DIR=$1
+CKPT_DIR=$2
+LOG_BASE=$CKPT_DIR
+mkdir -p "$LOG_BASE"
+echo "Logging to $LOG_BASE"
+
+GPUS=(0 1 2 3 4 5 6 7)
+START_INDEX_LIST=(0 2 6 7 8 9 10 13)  # --start values: indices into the dimensions list of calc_vbench.py
+END_INDEX_LIST=(2 6 7 8 9 10 13 16)  # --end values: exclusive upper bounds of the same slices
+TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
+
+for i in "${!GPUS[@]}"; do
+    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py "$VIDEO_DIR" "$CKPT_DIR" --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
+done
diff --git a/eval/vbench/tabulate_vbench_scores.py b/eval/vbench/tabulate_vbench_scores.py
new file mode 100644
index 0000000..65d4570
--- /dev/null
+++ b/eval/vbench/tabulate_vbench_scores.py
@@ -0,0 +1,155 @@
+import argparse
+import json
+import os
+
+SEMANTIC_WEIGHT = 1  # weight of "semantic score" in "total score"
+QUALITY_WEIGHT = 4  # weight of "quality score" in "total score"
+
+QUALITY_LIST = [  # dimensions averaged into "quality score"
+    "subject consistency",
+    "background consistency",
+    "temporal flickering",
+    "motion smoothness",
+    "aesthetic quality",
+    "imaging quality",
+    "dynamic degree",
+]
+
+SEMANTIC_LIST = [  # dimensions averaged into "semantic score"
+    "object class",
+    "multiple objects",
+    "human action",
+    "color",
+    "spatial relationship",
+    "scene",
+    "appearance style",
+    "temporal style",
+    "overall consistency",
+]
+
+NORMALIZE_DIC = {  # per-dimension bounds used to rescale a raw score: (raw - Min) / (Max - Min)
+    "subject consistency": {"Min": 0.1462, "Max": 1.0},
+    "background consistency": {"Min": 0.2615, "Max": 1.0},
+    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
+    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
+    "dynamic degree": {"Min": 0.0, "Max": 1.0},
+    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
+    "imaging quality": {"Min": 0.0, "Max": 1.0},
+    "object class": {"Min": 0.0, "Max": 1.0},
+    "multiple objects": {"Min": 0.0, "Max": 1.0},
+    "human action": {"Min": 0.0, "Max": 1.0},
+    "color": {"Min": 0.0, "Max": 1.0},
+    "spatial relationship": {"Min": 0.0, "Max": 1.0},
+    "scene": {"Min": 0.0, "Max": 0.8222},
+    "appearance style": {"Min": 0.0009, "Max": 0.2855},
+    "temporal style": {"Min": 0.0, "Max": 0.364},
+    "overall consistency": {"Min": 0.0, "Max": 0.364},
+}
+
+DIM_WEIGHT = {  # multiplier applied to each rescaled score; the summed weights of a list divide that list's total
+    "subject consistency": 1,
+    "background consistency": 1,
+    "temporal flickering": 1,
+    "motion smoothness": 1,
+    "aesthetic quality": 1,
+    "imaging quality": 1,
+    "dynamic degree": 0.5,
+    "object class": 1,
+    "multiple objects": 1,
+    "human action": 1,
+    "color": 1,
+    "spatial relationship": 1,
+    "scene": 1,
+    "appearance style": 1,
+    "temporal style": 1,
+    "overall consistency": 1,
+}
+
+ordered_scaled_res = [  # order in which entries are appended to the "items" list of scaled_results.json
+    "total score",
+    "quality score",
+    "semantic score",
+    "subject consistency",
+    "background consistency",
+    "temporal flickering",
+    "motion smoothness",
+    "dynamic degree",
+    "aesthetic quality",
+    "imaging quality",
+    "object class",
+    "multiple objects",
+    "human action",
+    "color",
+    "spatial relationship",
+    "scene",
+    "appearance style",
+    "temporal style",
+    "overall consistency",
+]
+
+
+def parse_args():
+    """Parse the CLI; --score_dir is mandatory because every later step lists or joins paths under it."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--score_dir", type=str, required=True)  # ckpt_dir/eval/vbench
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    res_postfix = "_eval_results.json"
+    info_postfix = "_full_info.json"
+    files = os.listdir(args.score_dir)
+    res_files = [x for x in files if res_postfix in x]
+    info_files = [x for x in files if info_postfix in x]
+    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"
+
+    full_results = {}  # result key -> first element of its value, formatted as a 4-decimal string
+    for res_file in res_files:
+        # sanity check first: the matching *_full_info.json must list at least one video
+        info_file = res_file.split(res_postfix)[0] + info_postfix
+        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
+            info = json.load(f)
+            assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
+        # read results; only val[0] of each entry is kept
+        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
+            data = json.load(f)
+            for key, val in data.items():
+                full_results[key] = format(val[0], ".4f")
+
+    scaled_results = {}  # dimension name -> rescaled score multiplied by DIM_WEIGHT
+    dims = set()  # dimensions seen, used by the completeness check below
+    for key, val in full_results.items():
+        dim = key.replace("_", " ") if "_" in key else key  # NORMALIZE_DIC / DIM_WEIGHT keys are space-separated
+        scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (
+            NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"]
+        )
+        scaled_score *= DIM_WEIGHT[dim]  # the stored per-dimension value is weighted (dynamic degree is halved)
+        scaled_results[dim] = scaled_score
+        dims.add(dim)
+
+    assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"
+    # weighted means: scaled_results already include DIM_WEIGHT, so each sum is divided by the summed weights
+    quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
+    semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST])
+    scaled_results["quality score"] = quality_score
+    scaled_results["semantic score"] = semantic_score
+    scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (
+        QUALITY_WEIGHT + SEMANTIC_WEIGHT
+    )
+
+    formated_scaled_results = {"items": []}
+    for key in ordered_scaled_res:
+        # each entry is a single-key dict in a list, so the ordered_scaled_res order survives sort_keys=True
+        formated_score = format(scaled_results[key] * 100, ".2f") + "%"
+        formated_scaled_results["items"].append({key: formated_score})
+
+    output_file_path = os.path.join(args.score_dir, "all_results.json")  # raw 4-decimal scores
+    with open(output_file_path, "w") as outfile:
+        json.dump(full_results, outfile, indent=4, sort_keys=True)
+    print(f"results saved to: {output_file_path}")
+
+    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")  # percentages, two decimals
+    with open(scaled_file_path, "w") as outfile:
+        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
+    print(f"results saved to: {scaled_file_path}")
diff --git a/eval/vbench_i2v/calc_vbench_i2v.py b/eval/vbench_i2v/calc_vbench_i2v.py
new file mode 100644
index 0000000..b8d5901
--- /dev/null
+++ b/eval/vbench_i2v/calc_vbench_i2v.py
@@ -0,0 +1,71 @@
+import argparse
+import os
+import time
+
+import torch
+
+from vbench import VBench
+from vbench2_beta_i2v import VBenchI2V
+
+full_info_path = "eval/vbench_i2v/vbench2_i2v_full_info.json"  # handed to both VBenchI2V(...) and VBench(...) below
+video_quality_dimensions = [  # evaluated with VBench when --calc_quality is true; sliced by --start/--end
+    "subject_consistency",
+    "background_consistency",
+    "motion_smoothness",
+    "dynamic_degree",
+    "aesthetic_quality",
+    "imaging_quality",
+    "temporal_flickering",
+]
+i2v_dimensions = ["i2v_subject", "i2v_background", "camera_motion"]  # evaluated with VBenchI2V when --calc_i2v is true
+
+
+def str2bool(v):
+    """Map yes/true/t/y/1 and no/false/f/n/0 (any case) to a bool; real bools pass through unchanged."""
+    if isinstance(v, bool):
+        return v
+    truthy, falsy = ("yes", "true", "t", "y", "1"), ("no", "false", "f", "n", "0")
+    token = v.lower()
+    if token not in truthy + falsy:
+        raise argparse.ArgumentTypeError("Boolean value expected.")
+    return token in truthy
+
+
+def parse_args():
+    """Parse the CLI; --start/--end slice whichever dimension list(s) --calc_i2v / --calc_quality enable."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("video_folder", type=str)  # samples/samples..._vbench_i2v/
+    parser.add_argument("model_ckpt", type=str)  # a "vbench_i2v" output directory is created under this path
+    parser.add_argument("--start", type=int, default=0)  # index of the first dimension to be evaluated
+    parser.add_argument("--end", type=int, default=-1)  # exclusive end index; -1 means through the last dimension
+    parser.add_argument("--calc_i2v", type=str2bool, default=True)
+    parser.add_argument("--calc_quality", type=str2bool, default=True)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Run the enabled evaluators; both write their output under <model_ckpt>/vbench_i2v.
+    args = parse_args()
+    video_path = args.video_folder
+    output_dir = os.path.join(args.model_ckpt, "vbench_i2v")
+    os.makedirs(output_dir, exist_ok=True)
+
+    start_time = time.time()
+
+    if args.calc_i2v:
+        my_VBench_I2V = VBenchI2V(torch.device("cuda"), full_info_path, output_dir)
+        end = len(i2v_dimensions) if args.end == -1 else args.end  # -1 means "through the last dimension"
+        for i2v_dim in i2v_dimensions[args.start : end]:
+            my_VBench_I2V.evaluate(videos_path=video_path, name=i2v_dim, dimension_list=[i2v_dim], resolution="1-1")
+
+    # use VBench/evaluate.py default
+    kwargs = {"imaging_quality_preprocessing_mode": "longer"}
+
+    if args.calc_quality:
+        my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir)
+        end = len(video_quality_dimensions) if args.end == -1 else args.end
+        for quality_dim in video_quality_dimensions[args.start : end]:
+            my_VBench.evaluate(
+                videos_path=video_path, name=quality_dim, dimension_list=[quality_dim], mode="vbench_standard", **kwargs
+            )
+    print("Runtime: %s seconds " % (time.time() - start_time))
diff --git a/eval/vbench_i2v/json_to_txt.py b/eval/vbench_i2v/json_to_txt.py
new file mode 100644
index 0000000..7631d62
--- /dev/null
+++ b/eval/vbench_i2v/json_to_txt.py
@@ -0,0 +1,17 @@
+import json
+import os
+
+RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"]
+
+cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop"  # NOTE(review): hard-coded absolute path, adjust per machine
+resolution = RESOLUTIONS[0]
+json_file = "vbench2_i2v_full_info.json"
+save_path = "all_i2v.txt"
+with open(json_file, encoding="utf-8") as f:  # context manager closes the handle that a bare open() would leak
+    data = json.load(f)
+txt = [  # one line per entry: the prompt immediately followed by a {"reference_path", "mask_strategy"} suffix
+    f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}'
+    for x in data
+]
+with open(save_path, "w") as f:
+    f.write("\n".join(txt))
diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
new file mode 100644
index 0000000..193c581
--- /dev/null
+++ b/eval/vbench_i2v/launch.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Usage (run where eval/ is reachable): bash eval/vbench_i2v/launch.sh CKPT NUM_FRAMES MODEL_NAME [RES ASP_RATIO [NUM_SAMPLING_STEPS [FLOW [LLM_REFINE]]]]
+CKPT=$1
+NUM_FRAMES=$2
+MODEL_NAME=$3
+RES=$4
+ASP_RATIO=$5
+
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
+if [[ $CKPT == *"ema"* ]]; then  # NOTE: CKPT_BASE is computed here but not referenced again in this script
+    parentdir=$(dirname $CKPT)
+    CKPT_BASE=$(basename $parentdir)_ema
+else
+    CKPT_BASE=$(basename $CKPT)
+fi
+LOG_BASE=$(dirname $CKPT)/eval
+mkdir -p "${LOG_BASE}"  # the per-task log redirections below fail if this directory is missing
+echo "Logging to $LOG_BASE"
+GPUS=(0 1 2 3 4 5 6 7)
+TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
+START_INDEX_LIST=(0 140 280 420 560 700 840 980)
+END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
+
+# One background eval/sample.sh job per GPU; optional arguments are forwarded only up to the first empty one.
+for i in "${!GPUS[@]}"; do
+    if [ -z "${RES}" ] || [ -z "${ASP_RATIO}" ]  ;
+        then
+            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+        else
+            if [ -z "${NUM_SAMPLING_STEPS}" ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z "${FLOW}" ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z "${LLM_REFINE}" ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
+    fi
+done
diff --git a/eval/vbench_i2v/launch_calc.sh b/eval/vbench_i2v/launch_calc.sh
new file mode 100644
index 0000000..a55baf4
--- /dev/null
+++ b/eval/vbench_i2v/launch_calc.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Usage: bash eval/vbench_i2v/launch_calc.sh VIDEO_DIR CKPT_DIR  (starts one background calc_vbench_i2v.py job per GPU)
+VIDEO_DIR=$1
+CKPT_DIR=$2
+LOG_BASE=$CKPT_DIR
+mkdir -p "$LOG_BASE"
+echo "Logging to $LOG_BASE"
+
+GPUS=(0 1 2 3 4 5 6 7)
+CALC_I2V_LIST=(True True False False False False False False)
+CALC_QUALITY_LIST=(False False True True True True True True)
+START_INDEX_LIST=(0 2 0 2 3 4 5 6)
+END_INDEX_LIST=(2 -1 2 3 4 5 6 -1)
+TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only
+
+# GPUs 0-1 run the i2v dimensions and GPUs 2-7 the quality dimensions; an --end of -1 means "through the last one".
+for i in "${!GPUS[@]}"; do
+    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py "$VIDEO_DIR" "$CKPT_DIR" --calc_i2v ${CALC_I2V_LIST[i]} --calc_quality ${CALC_QUALITY_LIST[i]} --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > "${LOG_BASE}/${TASK_ID_LIST[i]}.log" 2>&1 &
+done
diff --git a/eval/vbench_i2v/tabulate_vbench_i2v_scores.py b/eval/vbench_i2v/tabulate_vbench_i2v_scores.py
new file mode 100644
index 0000000..097f276
--- /dev/null
+++ b/eval/vbench_i2v/tabulate_vbench_i2v_scores.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+import os
+
+I2V_WEIGHT = 1.0  # weight of "i2v score" in "total score"
+I2V_QUALITY_WEIGHT = 1.0  # weight of "quality score" in "total score"
+
+I2V_LIST = [  # dimensions averaged into "i2v score" (camera_motion is not in this list)
+    "i2v_subject",
+    "i2v_background",
+]
+
+I2V_QUALITY_LIST = [  # dimensions averaged into "quality score"
+    "subject_consistency",
+    "background_consistency",
+    "temporal_flickering",
+    "motion_smoothness",
+    "aesthetic_quality",
+    "imaging_quality",
+    "dynamic_degree",
+]
+
+DIM_WEIGHT_I2V = {  # multiplier applied to each rescaled score; the summed weights of a list divide that list's total
+    "camera_motion": 0.1,
+    "i2v_subject": 1,
+    "i2v_background": 1,
+    "subject_consistency": 1,
+    "background_consistency": 1,
+    "motion_smoothness": 1,
+    "dynamic_degree": 0.5,
+    "aesthetic_quality": 1,
+    "imaging_quality": 1,
+    "temporal_flickering": 1,
+}
+
+NORMALIZE_DIC_I2V = {  # per-dimension bounds used to rescale a raw score: (raw - Min) / (Max - Min)
+    "camera_motion": {"Min": 0.0, "Max": 1.0},
+    "i2v_subject": {"Min": 0.1462, "Max": 1.0},
+    "i2v_background": {"Min": 0.2615, "Max": 1.0},
+    "subject_consistency": {"Min": 0.1462, "Max": 1.0},
+    "background_consistency": {"Min": 0.2615, "Max": 1.0},
+    "motion_smoothness": {"Min": 0.7060, "Max": 0.9975},
+    "dynamic_degree": {"Min": 0.0, "Max": 1.0},
+    "aesthetic_quality": {"Min": 0.0, "Max": 1.0},
+    "imaging_quality": {"Min": 0.0, "Max": 1.0},
+    "temporal_flickering": {"Min": 0.6293, "Max": 1.0},
+}
+
+ordered_scaled_res = [  # order in which entries are appended to the "item" list of scaled_results.json
+    "total score",
+    "i2v score",
+    "quality score",
+    "camera_motion",
+    "i2v_subject",
+    "i2v_background",
+    "subject_consistency",
+    "background_consistency",
+    "motion_smoothness",
+    "dynamic_degree",
+    "aesthetic_quality",
+    "imaging_quality",
+    "temporal_flickering",
+]
+
+
+def parse_args():
+    """Parse the CLI; --score_dir is mandatory because every later step lists or joins paths under it."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--score_dir", type=str, required=True)  # ckpt_dir/eval/vbench_i2v
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    res_postfix = "_eval_results.json"
+    info_postfix = "_full_info.json"
+    files = os.listdir(args.score_dir)
+    res_files = [x for x in files if res_postfix in x]
+    info_files = [x for x in files if info_postfix in x]
+    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"
+
+    full_results = {}  # result key -> first element of its value, formatted as a 4-decimal string
+    for res_file in res_files:
+        # sanity check first: the matching *_full_info.json must list at least one video
+        info_file = res_file.split(res_postfix)[0] + info_postfix
+        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
+            info = json.load(f)
+            assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
+        # read results; only val[0] of each entry is kept
+        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
+            data = json.load(f)
+            for key, val in data.items():
+                full_results[key] = format(val[0], ".4f")
+
+    scaled_results = {}  # dimension name -> rescaled score multiplied by DIM_WEIGHT_I2V
+    dims = set()  # dimensions seen, used by the completeness check below
+    for key, val in full_results.items():
+        dim = key  # result keys are used as-is; the lookup tables here keep the underscore names
+        scaled_score = (float(val) - NORMALIZE_DIC_I2V[dim]["Min"]) / (
+            NORMALIZE_DIC_I2V[dim]["Max"] - NORMALIZE_DIC_I2V[dim]["Min"]
+        )
+        scaled_score *= DIM_WEIGHT_I2V[dim]  # stored value is weighted (camera_motion x0.1, dynamic_degree x0.5)
+        scaled_results[dim] = scaled_score
+        dims.add(dim)
+
+    assert len(dims) == len(NORMALIZE_DIC_I2V), f"{set(NORMALIZE_DIC_I2V.keys())-dims} not calculated yet"
+    # weighted means: scaled_results already include DIM_WEIGHT_I2V, so each sum is divided by the summed weights
+    quality_score = sum([scaled_results[i] for i in I2V_QUALITY_LIST]) / sum(
+        [DIM_WEIGHT_I2V[i] for i in I2V_QUALITY_LIST]
+    )
+    i2v_score = sum([scaled_results[i] for i in I2V_LIST]) / sum([DIM_WEIGHT_I2V[i] for i in I2V_LIST])
+
+    scaled_results["quality score"] = quality_score
+    scaled_results["i2v score"] = i2v_score
+    scaled_results["total score"] = (quality_score * I2V_QUALITY_WEIGHT + i2v_score * I2V_WEIGHT) / (
+        I2V_QUALITY_WEIGHT + I2V_WEIGHT
+    )
+
+    formated_scaled_results = {"item": []}  # single-key dicts in a list keep the ordered_scaled_res order
+    for key in ordered_scaled_res:
+        formated_res = format(scaled_results[key] * 100, ".2f") + "%"
+        formated_scaled_results["item"].append({key: formated_res})
+
+    output_file_path = os.path.join(args.score_dir, "all_results.json")  # raw 4-decimal scores
+    with open(output_file_path, "w") as outfile:
+        json.dump(full_results, outfile, indent=4, sort_keys=True)
+    print(f"results saved to: {output_file_path}")
+
+    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")  # percentages, two decimals
+    with open(scaled_file_path, "w") as outfile:
+        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
+    print(f"results saved to: {scaled_file_path}")
diff --git a/eval/vbench_i2v/vbench2_i2v_full_info.json b/eval/vbench_i2v/vbench2_i2v_full_info.json
new file mode 100644
index 0000000..3aa6651
--- /dev/null
+++ b/eval/vbench_i2v/vbench2_i2v_full_info.json
@@ -0,0 +1,10858 @@
+[
+    {
+        "prompt_en": "a close up of a blue and orange liquid",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "a close up of a blue and orange liquid, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close up of a blue and orange liquid.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "A black and white abstract video featuring mesmerizing bubbles, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "A black and white abstract video featuring mesmerizing bubbles.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a blue and white smoke is swirly in the dark, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a blue and white smoke is swirly in the dark.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a sea fan in the water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a close-up view of a sea fan in the water.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a visually captivating abstract video, rich in color, set against a dramatic black background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a visually captivating abstract video, rich in color, set against a dramatic black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a purple and yellow abstract painting with a black background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a purple and yellow abstract painting with a black background.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a dynamic video of a blurry neon light in the dark, radiating captivating colors, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a dynamic video of a blurry neon light in the dark, radiating captivating colors.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a view of a star trail in the night sky, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "abstract",
+        "image_name": "a view of a star trail in the night sky.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a small town on the edge of the ocean, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a small town on the edge of the ocean.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "Colorful buildings on the seaside cliffs, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "Colorful buildings on the seaside cliffs.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a bunch of houses that are on a hillside, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bunch of houses that are on a hillside.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "a building that is sitting on the side of a pond, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a building that is sitting on the side of a pond.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a busy city with a bridge in the background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a busy city with a bridge in the background.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is over a body of water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a bridge that is over a body of water.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a pile of wood sitting next to a log house, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pile of wood sitting next to a log house.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "a view of a snowy mountain side with many buildings, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a view of a snowy mountain side with many buildings.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "san francisco skyline at sunset, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "san francisco skyline at sunset.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "a castle on top of a hill covered in snow, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a castle on top of a hill covered in snow.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of big ben and the houses of parliament in london, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of big ben and the houses of parliament in london.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "a beach with a lot of buildings on the side of a cliff, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a beach with a lot of buildings on the side of a cliff.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "an alley way in an old european city, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an alley way in an old european city.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the golden gate bridge in san franscisco is lit up by the setting sun, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the golden gate bridge in san franscisco is lit up by the setting sun.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the great wall of china in autumn, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the great wall of china in autumn.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "the town of hallstatt is surrounded by mountains and water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the town of hallstatt is surrounded by mountains and water.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "tokyo skyline at night, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tokyo skyline at night.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large wave crashes into a lighthouse.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "a church sits on top of a hill under a cloudy sky, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a church sits on top of a hill under a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "the parthenon in acropolis, greece, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the parthenon in acropolis, greece.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "a large crowd of people walking in a shopping mall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a large crowd of people walking in a shopping mall.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "the pyramids of giza, egypt, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the pyramids of giza, egypt.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a stage door painted with a star on the side of a brick wall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a stage door painted with a star on the side of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "a light house on the edge of the water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a light house on the edge of the water.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "an asian city street at night with people and bicycles, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an asian city street at night with people and bicycles.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a couple of wooden benches in the middle of a street, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a couple of wooden benches in the middle of a street.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a pagoda sits on top of a mountain in japan, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a pagoda sits on top of a mountain in japan.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a red bus driving down a snowy street at night, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a red bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a snow covered street, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a snow covered street.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a house with snow on the ground, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a house with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "cars parked on the side of the road during a snowstorm, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "cars parked on the side of the road during a snowstorm.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of statues on the side of a building, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a group of statues on the side of a building.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "a city street at night during a snow storm, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a city street at night during a snow storm.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "tower bridge in london, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "tower bridge in london.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "chinese pagoda in the middle of a snowy day, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "chinese pagoda in the middle of a snowy day.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a dark alleyway with a bus driving down it, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a dark alleyway with a bus driving down it.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "a monastery sits on top of a cliff in bhutan, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a monastery sits on top of a cliff in bhutan.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "the dome of the rock in jerusalem, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "the dome of the rock in jerusalem.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a futuristic building on a cliff overlooking a body of water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "an aerial view of a futuristic building on a cliff overlooking a body of water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a reflection of a city with buildings in the water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "architecture",
+        "image_name": "a reflection of a city with buildings in the water.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a bar with chairs and a television on the wall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a bar with chairs and a television on the wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with lots of books on a wall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with lots of books on a wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a living room filled with furniture next to a stone wall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room filled with furniture next to a stone wall.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with sunlight coming through the window, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with sunlight coming through the window.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "a room filled with lots of shelves filled with books, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with lots of shelves filled with books.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "an art gallery with paintings on the walls, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an art gallery with paintings on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a room with a lot of pictures on the walls, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a lot of pictures on the walls.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a painting of a cloudy sky next to an easel, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a painting of a cloudy sky next to an easel.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a living room with a christmas tree and a rocking chair, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a christmas tree and a rocking chair.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a kitchen with a sink and a lot of glasses on the counter, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a kitchen with a sink and a lot of glasses on the counter.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a wooden table in front of a brick wall with bottles on the wall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a wooden table in front of a brick wall with bottles on the wall.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "a room filled with paintings and statues, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with paintings and statues.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "an outdoor dining area surrounded by plants and a brick walkway, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "an outdoor dining area surrounded by plants and a brick walkway.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a room filled with books and teddy bears, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room filled with books and teddy bears.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a table and chairs in a room with a plant in the corner, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a table and chairs in a room with a plant in the corner.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a living room with a couch, table, and a window, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with a couch, table, and a window.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a modern living room with wood floors and a tv, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a modern living room with wood floors and a tv.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a room with a desk and a chair in it, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a room with a desk and a chair in it.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a building, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a large waterfall in the middle of a building.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a chair in a room next to some drawings, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a chair in a room next to some drawings.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "a living room with hardwood floors and a white couch, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "indoor",
+        "image_name": "a living room with hardwood floors and a white couch.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "two people in a canoe on a lake with mountains in the background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two people in a canoe on a lake with mountains in the background.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a snowy road in a forest, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a snowy road in a forest.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a view of a waterfall from a distance, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a view of a waterfall from a distance.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a valley, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a valley.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a group of islands in the middle of a lake, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a group of islands in the middle of a lake.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a rocky beach in indonesia, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a rocky beach in indonesia.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "fireworks in the night sky over a city, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "fireworks in the night sky over a city.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes into a lighthouse on a stormy day, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes into a lighthouse on a stormy day.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with a sky background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with a sky background.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a large bonfire is burning in the night sky, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large bonfire is burning in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of the flames of a fireplace, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a close-up view of the flames of a fireplace.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a farm in the middle of the day, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a farm in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a flock of birds flying over a tree at sunset, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a flock of birds flying over a tree at sunset.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a captivating scene featuring a spiral galaxy shining brilliantly in the night sky.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a mountain with snow on it, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain with snow on it.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a bridge that is in the middle of a river, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a bridge that is in the middle of a river.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a group of people standing on top of a green hill, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of people standing on top of a green hill.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with a wooden pier in the water, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with a wooden pier in the water.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a lake surrounded by mountains and flowers, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a lake surrounded by mountains and flowers.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "a hot-air balloon flying over a desert landscape, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a hot-air balloon flying over a desert landscape.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "several hot air balloons flying over a city, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "several hot air balloons flying over a city.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a group of hot air balloons flying over a field, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a group of hot air balloons flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "a large wave crashes over a rocky cliff, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave crashes over a rocky cliff.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "the sun is setting over a lake in the mountains, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is setting over a lake in the mountains.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "a mountain range with snow on the ground, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain range with snow on the ground.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "sun rays shining through clouds over a lake, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "sun rays shining through clouds over a lake.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a boat sits on the shore of a lake with mt fuji in the background, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a boat sits on the shore of a lake with mt fuji in the background.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy road with trees in the distance, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy road with trees in the distance.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "two swans swimming on a lake in the fog, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "two swans swimming on a lake in the fog.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "the sun is shining through the trees near a waterfall, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "the sun is shining through the trees near a waterfall.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "a sandy beach with palm trees on the shore, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a sandy beach with palm trees on the shore.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a body of water and a beach, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a body of water and a beach.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy field that has trees in the grass, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy field that has trees in the grass.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a foggy landscape with trees and hills in the distance, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a foggy landscape with trees and hills in the distance.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a large wave in the ocean with a lot of spray coming from it, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large wave in the ocean with a lot of spray coming from it.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a waterfall is shown in the middle of a lush green hillside, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a waterfall is shown in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "an aerial view of a curvy road in the middle of a forest, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "an aerial view of a curvy road in the middle of a forest.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a mountain covered in snow with evergreen trees, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a mountain covered in snow with evergreen trees.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a very large waterfall in the middle of the day, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a very large waterfall in the middle of the day.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside",
+        "dimension": [
+            "i2v_background",
+            "background_consistency",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera pans left",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera pans right",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera tilts up",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera tilts down",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera zooms in",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera zooms out",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a large waterfall in the middle of a lush green hillside, camera static",
+        "dimension": [
+            "camera_motion"
+        ],
+        "image_type": "scenery",
+        "image_name": "a large waterfall in the middle of a lush green hillside.jpg"
+    },
+    {
+        "prompt_en": "a brown bear in the water with a fish in its mouth",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a brown bear in the water with a fish in its mouth.jpg"
+    },
+    {
+        "prompt_en": "a close-up of a hippopotamus eating grass in a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a close-up of a hippopotamus eating grass in a field.jpg"
+    },
+    {
+        "prompt_en": "a sea turtle swimming in the ocean under the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a sea turtle swimming in the ocean under the water.jpg"
+    },
+    {
+        "prompt_en": "two bees are flying over a lavender plant",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "two bees are flying over a lavender plant.jpg"
+    },
+    {
+        "prompt_en": "the otter is standing in the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "the otter is standing in the water.jpg"
+    },
+    {
+        "prompt_en": "a dog carrying a soccer ball in its mouth",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a dog carrying a soccer ball in its mouth.jpg"
+    },
+    {
+        "prompt_en": "an eagle is flying over a mountain with trees in the background",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "an eagle is flying over a mountain with trees in the background.jpg"
+    },
+    {
+        "prompt_en": "a couple of horses are running in the dirt",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a couple of horses are running in the dirt.jpg"
+    },
+    {
+        "prompt_en": "a highland cow with long horns standing in a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a highland cow with long horns standing in a field.jpg"
+    },
+    {
+        "prompt_en": "a monkey is holding a banana in its mouth",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a monkey is holding a banana in its mouth.jpg"
+    },
+    {
+        "prompt_en": "a large rhino grazing in the grass near a bush",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a large rhino grazing in the grass near a bush.jpg"
+    },
+    {
+        "prompt_en": "a butterfly sits on top of a purple flower",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a butterfly sits on top of a purple flower.jpg"
+    },
+    {
+        "prompt_en": "an alligator is covered in green plants in the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "an alligator is covered in green plants in the water.jpg"
+    },
+    {
+        "prompt_en": "a red panda eating bamboo in a zoo",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a red panda eating bamboo in a zoo.jpg"
+    },
+    {
+        "prompt_en": "a monochromatic video capturing a cat's gaze into the camera",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a monochromatic video capturing a cat's gaze into the camera.jpg"
+    },
+    {
+        "prompt_en": "a frog sitting on top of water lily leaves",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a frog sitting on top of water lily leaves.jpg"
+    },
+    {
+        "prompt_en": "a lion is roaring in the wild",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a lion is roaring in the wild.jpg"
+    },
+    {
+        "prompt_en": "a seagull is flying towards a person's hand",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a seagull is flying towards a person's hand.jpg"
+    },
+    {
+        "prompt_en": "a yellow and white jellyfish is floating in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a yellow and white jellyfish is floating in the ocean.jpg"
+    },
+    {
+        "prompt_en": "a group of jellyfish swimming in an aquarium",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a group of jellyfish swimming in an aquarium.jpg"
+    },
+    {
+        "prompt_en": "a clown fish hiding in a purple anemone",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a clown fish hiding in a purple anemone.jpg"
+    },
+    {
+        "prompt_en": "a snake sitting on the ground next to a bowl",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a snake sitting on the ground next to a bowl.jpg"
+    },
+    {
+        "prompt_en": "a brown and white cow eating hay",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a brown and white cow eating hay.jpg"
+    },
+    {
+        "prompt_en": "a seal swimming in the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a seal swimming in the water.jpg"
+    },
+    {
+        "prompt_en": "a panda bear is eating a piece of bamboo",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a panda bear is eating a piece of bamboo.jpg"
+    },
+    {
+        "prompt_en": "a small bird sits on a moss covered branch",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a small bird sits on a moss covered branch.jpg"
+    },
+    {
+        "prompt_en": "a bird with a fish in its beak flying over a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a bird with a fish in its beak flying over a field.jpg"
+    },
+    {
+        "prompt_en": "a large flock of birds flying in the sky",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a large flock of birds flying in the sky.jpg"
+    },
+    {
+        "prompt_en": "a bald eagle flying over a tree filled forest",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a bald eagle flying over a tree filled forest.jpg"
+    },
+    {
+        "prompt_en": "a giraffe walking in a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a giraffe walking in a field.jpg"
+    },
+    {
+        "prompt_en": "a lioness yawning in a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a lioness yawning in a field.jpg"
+    },
+    {
+        "prompt_en": "a little crab scurried on the sandy beach",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a little crab scurried on the sandy beach.jpg"
+    },
+    {
+        "prompt_en": "a warthog is walking in the grass",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a warthog is walking in the grass.jpg"
+    },
+    {
+        "prompt_en": "a penguin walking on a beach near the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a penguin walking on a beach near the water.jpg"
+    },
+    {
+        "prompt_en": "a tiger walking through a wooded area",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a tiger walking through a wooded area.jpg"
+    },
+    {
+        "prompt_en": "a tiger walking on a dirt path in the woods",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a tiger walking on a dirt path in the woods.jpg"
+    },
+    {
+        "prompt_en": "a small monkey holding a piece of food in it's mouth",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a small monkey holding a piece of food in it's mouth.jpg"
+    },
+    {
+        "prompt_en": "a squirrel sitting on the ground eating a piece of bread",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a squirrel sitting on the ground eating a piece of bread.jpg"
+    },
+    {
+        "prompt_en": "a group of fish swimming over a coral reef",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a group of fish swimming over a coral reef.jpg"
+    },
+    {
+        "prompt_en": "a toad is sitting on top of some moss",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a toad is sitting on top of some moss.jpg"
+    },
+    {
+        "prompt_en": "a great white shark swimming in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a great white shark swimming in the ocean.jpg"
+    },
+    {
+        "prompt_en": "a group of camels resting in the desert",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a group of camels resting in the desert.jpg"
+    },
+    {
+        "prompt_en": "two sheep grazing in the grass next to a wooden bridge",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "two sheep grazing in the grass next to a wooden bridge.jpg"
+    },
+    {
+        "prompt_en": "an elephant walking through a forest",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "an elephant walking through a forest.jpg"
+    },
+    {
+        "prompt_en": "a white rooster standing in a grassy field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a white rooster standing in a grassy field.jpg"
+    },
+    {
+        "prompt_en": "a zebra walking across a dirt road near a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "animal",
+        "image_name": "a zebra walking across a dirt road near a field.jpg"
+    },
+    {
+        "prompt_en": "cars are driving down a street lined with tall trees",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "cars are driving down a street lined with tall trees.jpg"
+    },
+    {
+        "prompt_en": "the cars on the street are waiting for the traffic lights",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "the cars on the street are waiting for the traffic lights.jpg"
+    },
+    {
+        "prompt_en": "a bicycle leaning against a fence in the snow",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a bicycle leaning against a fence in the snow.jpg"
+    },
+    {
+        "prompt_en": "a blue fishing boat is navigating in the ocean next to a cruise ship",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a blue fishing boat is navigating in the ocean next to a cruise ship.jpg"
+    },
+    {
+        "prompt_en": "a blue car driving down a dirt road near train tracks",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a blue car driving down a dirt road near train tracks.jpg"
+    },
+    {
+        "prompt_en": "a sailboat is drifting on the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a sailboat is drifting on the ocean.jpg"
+    },
+    {
+        "prompt_en": "a couple of boats floating on a body of water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a couple of boats floating on a body of water.jpg"
+    },
+    {
+        "prompt_en": "a city street with cars driving in the rain",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a city street with cars driving in the rain.jpg"
+    },
+    {
+        "prompt_en": "a red and white tram traveling down a snowy street",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a red and white tram traveling down a snowy street.jpg"
+    },
+    {
+        "prompt_en": "a city bus driving down a snowy street at night",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a city bus driving down a snowy street at night.jpg"
+    },
+    {
+        "prompt_en": "a green toy car is sitting on the ground",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a green toy car is sitting on the ground.jpg"
+    },
+    {
+        "prompt_en": "a train traveling down tracks through the woods with leaves on the ground",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a train traveling down tracks through the woods with leaves on the ground.jpg"
+    },
+    {
+        "prompt_en": "a man in a small boat fishing in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a man in a small boat fishing in the ocean.jpg"
+    },
+    {
+        "prompt_en": "an airplane is flying through the sky at sunset",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "an airplane is flying through the sky at sunset.jpg"
+    },
+    {
+        "prompt_en": "an old rusty car sits in the middle of a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "an old rusty car sits in the middle of a field.jpg"
+    },
+    {
+        "prompt_en": "a motorcycle driving down a road",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a motorcycle driving down a road.jpg"
+    },
+    {
+        "prompt_en": "a blue train traveling through a lush green area",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a blue train traveling through a lush green area.jpg"
+    },
+    {
+        "prompt_en": "a white car is swiftly driving on a dirt road near a bush, kicking up dust",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a white car is swiftly driving on a dirt road near a bush, kicking up dust.jpg"
+    },
+    {
+        "prompt_en": "a large cargo ship sailing in the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a large cargo ship sailing in the water.jpg"
+    },
+    {
+        "prompt_en": "the red Alfa sports car is speeding down the road",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "the red Alfa sports car is speeding down the road.jpg"
+    },
+    {
+        "prompt_en": "two cars that have been involved in a violent collision",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "two cars that have been involved in a violent collision.jpg"
+    },
+    {
+        "prompt_en": "a red double decker bus driving down a street",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a red double decker bus driving down a street.jpg"
+    },
+    {
+        "prompt_en": "A red sports car driving through sand, kicking up a large amount of dust",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "A red sports car driving through sand, kicking up a large amount of dust.jpg"
+    },
+    {
+        "prompt_en": "a yellow toy car parked on a rock near the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a yellow toy car parked on a rock near the water.jpg"
+    },
+    {
+        "prompt_en": "a space shuttle taking off into the sky",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a space shuttle taking off into the sky.jpg"
+    },
+    {
+        "prompt_en": "a steam train traveling through the woods",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a steam train traveling through the woods.jpg"
+    },
+    {
+        "prompt_en": "a group of buses parked at a bus station",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a group of buses parked at a bus station.jpg"
+    },
+    {
+        "prompt_en": "A bunch of cars are driving on a highway",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "A bunch of cars are driving on a highway.jpg"
+    },
+    {
+        "prompt_en": "a white and blue airplane flying in the sky",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "a white and blue airplane flying in the sky.jpg"
+    },
+    {
+        "prompt_en": "A space station orbited above the Earth",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "A space station orbited above the Earth.jpg"
+    },
+    {
+        "prompt_en": "A yellow boat is cruising in front of a bridge",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "transportation",
+        "image_name": "A yellow boat is cruising in front of a bridge.jpg"
+    },
+    {
+        "prompt_en": "tangerines in a metal bowl on a table",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "tangerines in a metal bowl on a table.jpg"
+    },
+    {
+        "prompt_en": "a shadow of a hand reaching for a leaf",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a shadow of a hand reaching for a leaf.jpg"
+    },
+    {
+        "prompt_en": "A teddy bear is climbing over a wooden fence",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "A teddy bear is climbing over a wooden fence.jpg"
+    },
+    {
+        "prompt_en": "a book on fire with flames coming out of it",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a book on fire with flames coming out of it.jpg"
+    },
+    {
+        "prompt_en": "a close-up of a pink rose with water droplets on it",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close-up of a pink rose with water droplets on it.jpg"
+    },
+    {
+        "prompt_en": "a person is cooking meat on a grill with flames",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person is cooking meat on a grill with flames.jpg"
+    },
+    {
+        "prompt_en": "a snowman wearing a santa hat and scarf",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a snowman wearing a santa hat and scarf.jpg"
+    },
+    {
+        "prompt_en": "a person holding a sparkler in their hand",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a person holding a sparkler in their hand.jpg"
+    },
+    {
+        "prompt_en": "a teddy bear sitting on a moss covered ground",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a teddy bear sitting on a moss covered ground.jpg"
+    },
+    {
+        "prompt_en": "a statue of a lion is sitting on a pedestal",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a statue of a lion is sitting on a pedestal.jpg"
+    },
+    {
+        "prompt_en": "metal balls are suspended in the air",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "metal balls are suspended in the air.jpg"
+    },
+    {
+        "prompt_en": "a close up of a bunch of green grapes",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close up of a bunch of green grapes.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a green plant with unfurled fronds",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close-up view of a green plant with unfurled fronds.jpg"
+    },
+    {
+        "prompt_en": "an orange mushroom sitting on top of a tree stump in the woods",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "an orange mushroom sitting on top of a tree stump in the woods.jpg"
+    },
+    {
+        "prompt_en": "a stack of pancakes covered in syrup and fruit",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a stack of pancakes covered in syrup and fruit.jpg"
+    },
+    {
+        "prompt_en": "a plate of spaghetti with spinach and tomatoes",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a plate of spaghetti with spinach and tomatoes.jpg"
+    },
+    {
+        "prompt_en": "a pink lotus flower in the middle of a pond",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a pink lotus flower in the middle of a pond.jpg"
+    },
+    {
+        "prompt_en": "a person holding a sparkler in front of a sunset",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a person holding a sparkler in front of a sunset.jpg"
+    },
+    {
+        "prompt_en": "a pink rose is blooming in a garden",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a pink rose is blooming in a garden.jpg"
+    },
+    {
+        "prompt_en": "a snow man holding a lantern in the snow",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "other",
+        "image_name": "a snow man holding a lantern in the snow.jpg"
+    },
+    {
+        "prompt_en": "a stack of chocolate cookies with a bite taken out of it",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a stack of chocolate cookies with a bite taken out of it.jpg"
+    },
+    {
+        "prompt_en": "a white plate topped with eggs, toast, tomatoes, and a sausage",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a white plate topped with eggs, toast, tomatoes, and a sausage.jpg"
+    },
+    {
+        "prompt_en": "a yellow water lily is floating in a pond",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a yellow water lily is floating in a pond.jpg"
+    },
+    {
+        "prompt_en": "an astronaut floating in space with the earth in the background",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an astronaut floating in space with the earth in the background.jpg"
+    },
+    {
+        "prompt_en": "A little girl, lost in thought, is quietly sitting on the bus",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "A little girl, lost in thought, is quietly sitting on the bus.jpg"
+    },
+    {
+        "prompt_en": "a man holding a tray in front of a brick wall",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man holding a tray in front of a brick wall.jpg"
+    },
+    {
+        "prompt_en": "an older man playing a saxophone on the street",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an older man playing a saxophone on the street.jpg"
+    },
+    {
+        "prompt_en": "an older man jogging by the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an older man jogging by the water.jpg"
+    },
+    {
+        "prompt_en": "a person riding a skateboard on a concrete floor",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding a skateboard on a concrete floor.jpg"
+    },
+    {
+        "prompt_en": "a woman with long black hair is posing for a picture",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman with long black hair is posing for a picture.jpg"
+    },
+    {
+        "prompt_en": "a woman sitting on the ground in front of a guitar",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman sitting on the ground in front of a guitar.jpg"
+    },
+    {
+        "prompt_en": "a little girl wearing a purple helmet riding a blue bike",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a little girl wearing a purple helmet riding a blue bike.jpg"
+    },
+    {
+        "prompt_en": "a young boy is jumping in the mud",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a young boy is jumping in the mud.jpg"
+    },
+    {
+        "prompt_en": "a man sitting in the driver's seat of a car wearing sunglasses",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man sitting in the driver's seat of a car wearing sunglasses.jpg"
+    },
+    {
+        "prompt_en": "a little boy jumping in the air over a puddle of water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a little boy jumping in the air over a puddle of water.jpg"
+    },
+    {
+        "prompt_en": "a woman with afro hair is smiling while wearing earphones",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman with afro hair is smiling while wearing earphones.jpg"
+    },
+    {
+        "prompt_en": "a smiling woman with her hands clasped",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a smiling woman with her hands clasped.jpg"
+    },
+    {
+        "prompt_en": "a young boy standing in a field with horses in the background",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a young boy standing in a field with horses in the background.jpg"
+    },
+    {
+        "prompt_en": "a young man is covered in colored powder",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a young man is covered in colored powder.jpg"
+    },
+    {
+        "prompt_en": "a woman with curly hair is drinking a beer",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman with curly hair is drinking a beer.jpg"
+    },
+    {
+        "prompt_en": "an old man standing in the middle of a field holding a bunch of plants",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an old man standing in the middle of a field holding a bunch of plants.jpg"
+    },
+    {
+        "prompt_en": "a man standing on a boat with a net",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man standing on a boat with a net.jpg"
+    },
+    {
+        "prompt_en": "a woman in a hat is putting salt into a basket",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman in a hat is putting salt into a basket.jpg"
+    },
+    {
+        "prompt_en": "a young girl smelling a pink flower",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a young girl smelling a pink flower.jpg"
+    },
+    {
+        "prompt_en": "a young boy leaning on a wooden pole",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a young boy leaning on a wooden pole.jpg"
+    },
+    {
+        "prompt_en": "a man in a hat sitting in front of a brick oven",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man in a hat sitting in front of a brick oven.jpg"
+    },
+    {
+        "prompt_en": "a man in a mexican outfit holding an acoustic guitar",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man in a mexican outfit holding an acoustic guitar.jpg"
+    },
+    {
+        "prompt_en": "a snowboarder is in the air doing a trick",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a snowboarder is in the air doing a trick.jpg"
+    },
+    {
+        "prompt_en": "a man riding a horse with a spear in his hand",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man riding a horse with a spear in his hand.jpg"
+    },
+    {
+        "prompt_en": "a woman carrying a bundle of plants over their head",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman carrying a bundle of plants over their head.jpg"
+    },
+    {
+        "prompt_en": "a person jumping in the air over a fence",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person jumping in the air over a fence.jpg"
+    },
+    {
+        "prompt_en": "a man on a surfboard riding a wave in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man on a surfboard riding a wave in the ocean.jpg"
+    },
+    {
+        "prompt_en": "a man sitting on steps playing an acoustic guitar",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man sitting on steps playing an acoustic guitar.jpg"
+    },
+    {
+        "prompt_en": "a man swinging a tennis racquet at a tennis ball",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man swinging a tennis racquet at a tennis ball.jpg"
+    },
+    {
+        "prompt_en": "a man riding a mountain bike on top of a rocky hill",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man riding a mountain bike on top of a rocky hill.jpg"
+    },
+    {
+        "prompt_en": "a man riding a bike down a street",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man riding a bike down a street.jpg"
+    },
+    {
+        "prompt_en": "a man is running on a dirt road",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man is running on a dirt road.jpg"
+    },
+    {
+        "prompt_en": "A man in a black suit and a sombrero, shouting loudly",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "A man in a black suit and a sombrero, shouting loudly.jpg"
+    },
+    {
+        "prompt_en": "a man standing on top of a sand dune in the desert",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man standing on top of a sand dune in the desert.jpg"
+    },
+    {
+        "prompt_en": "a person riding a motorcycle down a road",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding a motorcycle down a road.jpg"
+    },
+    {
+        "prompt_en": "a man standing on top of a mountain with a backpack",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man standing on top of a mountain with a backpack.jpg"
+    },
+    {
+        "prompt_en": "a man with a skull face paint smoking a cigar and holding a guitar",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man with a skull face paint smoking a cigar and holding a guitar.jpg"
+    },
+    {
+        "prompt_en": "a man in sunglasses laying on a wooden bench",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man in sunglasses laying on a wooden bench.jpg"
+    },
+    {
+        "prompt_en": "an older woman sitting in a room with a cigarette in her hand",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an older woman sitting in a room with a cigarette in her hand.jpg"
+    },
+    {
+        "prompt_en": "a man sitting on the ground playing a musical instrument",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man sitting on the ground playing a musical instrument.jpg"
+    },
+    {
+        "prompt_en": "a person riding a horse in a polo match",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding a horse in a polo match.jpg"
+    },
+    {
+        "prompt_en": "a woman in a kimono holding an umbrella",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman in a kimono holding an umbrella.jpg"
+    },
+    {
+        "prompt_en": "a person riding a dirt bike",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding a dirt bike.jpg"
+    },
+    {
+        "prompt_en": "a person riding an atv on a dirt track",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding an atv on a dirt track.jpg"
+    },
+    {
+        "prompt_en": "a person riding a wave on a surfboard",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a person riding a wave on a surfboard.jpg"
+    },
+    {
+        "prompt_en": "a woman in a wetsuit is swimming in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman in a wetsuit is swimming in the ocean.jpg"
+    },
+    {
+        "prompt_en": "a man snorkling in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a man snorkling in the ocean.jpg"
+    },
+    {
+        "prompt_en": "a beautiful woman in a blue sari posing in front of a wall",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a beautiful woman in a blue sari posing in front of a wall.jpg"
+    },
+    {
+        "prompt_en": "a woman wearing a shawl in front of a mountain",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman wearing a shawl in front of a mountain.jpg"
+    },
+    {
+        "prompt_en": "a woman is making bread in an oven",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman is making bread in an oven.jpg"
+    },
+    {
+        "prompt_en": "a woman smiles while holding a yellow flower",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman smiles while holding a yellow flower.jpg"
+    },
+    {
+        "prompt_en": "A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "A young boy is lifting a bundle of dry grass wrapped in waterproof fabric over his head.jpg"
+    },
+    {
+        "prompt_en": "two people performing a sword fight in front of a forest",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two people performing a sword fight in front of a forest.jpg"
+    },
+    {
+        "prompt_en": "a woman in a colorful shirt is cooking food",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman in a colorful shirt is cooking food.jpg"
+    },
+    {
+        "prompt_en": "an older woman is drinking a bottle of water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an older woman is drinking a bottle of water.jpg"
+    },
+    {
+        "prompt_en": "a smiling woman sitting at a table with food and drinks",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a smiling woman sitting at a table with food and drinks.jpg"
+    },
+    {
+        "prompt_en": "a woman wearing a hijab reading a book on the beach",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman wearing a hijab reading a book on the beach.jpg"
+    },
+    {
+        "prompt_en": "a woman wearing a headscarf is reaching for an olive tree",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman wearing a headscarf is reaching for an olive tree.jpg"
+    },
+    {
+        "prompt_en": "a woman in a white dress jumping in the air in a field of pink flowers",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman in a white dress jumping in the air in a field of pink flowers.jpg"
+    },
+    {
+        "prompt_en": "a woman wearing a conical hat sits on a boat",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman wearing a conical hat sits on a boat.jpg"
+    },
+    {
+        "prompt_en": "an older woman sitting in front of an old building",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "an older woman sitting in front of an old building.jpg"
+    },
+    {
+        "prompt_en": "a woman is praying in front of a buddhist temple",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman is praying in front of a buddhist temple.jpg"
+    },
+    {
+        "prompt_en": "a woman with green hair smiling for the camera",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "single-human",
+        "image_name": "a woman with green hair smiling for the camera.jpg"
+    },
+    {
+        "prompt_en": "A group of people in a yellow raft is rowing through turbulent waters",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "A group of people in a yellow raft is rowing through turbulent waters.jpg"
+    },
+    {
+        "prompt_en": "a man carrying a woman on his back in a field",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man carrying a woman on his back in a field.jpg"
+    },
+    {
+        "prompt_en": "an indian police officer talking to an old woman",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "an indian police officer talking to an old woman.jpg"
+    },
+    {
+        "prompt_en": "two people scuba diving in the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two people scuba diving in the ocean.jpg"
+    },
+    {
+        "prompt_en": "A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "A man and woman dressed as sugar skulls in a field of flowers, sharing a loving gaze with each other.jpg"
+    },
+    {
+        "prompt_en": "a group of people watching a cow race",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of people watching a cow race.jpg"
+    },
+    {
+        "prompt_en": "a man and a child riding bumper cars in an amusement park",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man and a child riding bumper cars in an amusement park.jpg"
+    },
+    {
+        "prompt_en": "a group of motorcyclists racing on a dirt track",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of motorcyclists racing on a dirt track.jpg"
+    },
+    {
+        "prompt_en": "a man and a woman are boxing in a boxing ring",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man and a woman are boxing in a boxing ring.jpg"
+    },
+    {
+        "prompt_en": "a man holding a baby in his arms",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man holding a baby in his arms.jpg"
+    },
+    {
+        "prompt_en": "a man and a woman sitting on a bench playing instruments",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man and a woman sitting on a bench playing instruments.jpg"
+    },
+    {
+        "prompt_en": "two men are standing next to each other with a bicycle",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two men are standing next to each other with a bicycle.jpg"
+    },
+    {
+        "prompt_en": "a man and a boy sitting on a beach near the ocean",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man and a boy sitting on a beach near the ocean.jpg"
+    },
+    {
+        "prompt_en": "two men in white clothing standing next to each other",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two men in white clothing standing next to each other.jpg"
+    },
+    {
+        "prompt_en": "a group of men riding horses in a dusty arena",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of men riding horses in a dusty arena.jpg"
+    },
+    {
+        "prompt_en": "a soccer player in a yellow and black shirt is chasing a soccer ball",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a soccer player in a yellow and black shirt is chasing a soccer ball.jpg"
+    },
+    {
+        "prompt_en": "a group of women sitting on the steps of a building",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of women sitting on the steps of a building.jpg"
+    },
+    {
+        "prompt_en": "a group of people gathered around a red checkered blanket",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of people gathered around a red checkered blanket.jpg"
+    },
+    {
+        "prompt_en": "a group of people in orange jumpsuits running along a river",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of people in orange jumpsuits running along a river.jpg"
+    },
+    {
+        "prompt_en": "a woman walking down a sidewalk with a bag",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a woman walking down a sidewalk with a bag.jpg"
+    },
+    {
+        "prompt_en": "a busy street with cars and people on motorcycles",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a busy street with cars and people on motorcycles.jpg"
+    },
+    {
+        "prompt_en": "a man in a mask is walking through a crowd of people",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man in a mask is walking through a crowd of people.jpg"
+    },
+    {
+        "prompt_en": "a man and a woman walking under an umbrella next to a brick wall",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a man and a woman walking under an umbrella next to a brick wall.jpg"
+    },
+    {
+        "prompt_en": "a group of people riding bikes down a street",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "a group of people riding bikes down a street.jpg"
+    },
+    {
+        "prompt_en": "An old person is holding a cup on the street, and people around are curiously looking at him",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "An old person is holding a cup on the street, and people around are curiously looking at him.jpg"
+    },
+    {
+        "prompt_en": "two young girls playing with leaves in the woods",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two young girls playing with leaves in the woods.jpg"
+    },
+    {
+        "prompt_en": "One person is riding on the back of a horse led by another person",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "One person is riding on the back of a horse led by another person.jpg"
+    },
+    {
+        "prompt_en": "an older woman and a young girl are knitting together",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "an older woman and a young girl are knitting together.jpg"
+    },
+    {
+        "prompt_en": "three geishas walking down the street in traditional clothing",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "three geishas walking down the street in traditional clothing.jpg"
+    },
+    {
+        "prompt_en": "two men riding bikes down a road near a forest",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two men riding bikes down a road near a forest.jpg"
+    },
+    {
+        "prompt_en": "two women carrying bowls on their heads",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two women carrying bowls on their heads.jpg"
+    },
+    {
+        "prompt_en": "two women eating pizza at a restaurant",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two women eating pizza at a restaurant.jpg"
+    },
+    {
+        "prompt_en": "two young women studying in a library",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "multiple-human",
+        "image_name": "two young women studying in a library.jpg"
+    },
+    {
+        "prompt_en": "pink water lilies in a pond with leaves",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "pink water lilies in a pond with leaves.jpg"
+    },
+    {
+        "prompt_en": "a group of succulents in a rock garden",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a group of succulents in a rock garden.jpg"
+    },
+    {
+        "prompt_en": "a close up view of a bunch of snowdrop flowers",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close up view of a bunch of snowdrop flowers.jpg"
+    },
+    {
+        "prompt_en": "a close up of leaves with water droplets on them",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close up of leaves with water droplets on them.jpg"
+    },
+    {
+        "prompt_en": "a close-up of a sea anemone in the water",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close-up of a sea anemone in the water.jpg"
+    },
+    {
+        "prompt_en": "a plant with water droplets on it",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a plant with water droplets on it.jpg"
+    },
+    {
+        "prompt_en": "a group of cactus plants in the desert",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a group of cactus plants in the desert.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a plant with spiky leaves",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close-up view of a plant with spiky leaves.jpg"
+    },
+    {
+        "prompt_en": "A budding and blossoming flower bud seedling",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "A budding and blossoming flower bud seedling.jpg"
+    },
+    {
+        "prompt_en": "a field of orange flowers near the ocean'",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a field of orange flowers near the ocean'.jpg"
+    },
+    {
+        "prompt_en": "a close-up view of a bunch of pink flowers",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a close-up view of a bunch of pink flowers.jpg"
+    },
+    {
+        "prompt_en": "pink water lilies in a pond",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "pink water lilies in a pond.jpg"
+    },
+    {
+        "prompt_en": "reeds blowing in the wind against a cloudy sky",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "reeds blowing in the wind against a cloudy sky.jpg"
+    },
+    {
+        "prompt_en": "two tall cacti in the middle of the desert",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "two tall cacti in the middle of the desert.jpg"
+    },
+    {
+        "prompt_en": "a sea anemone on a coral reef",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a sea anemone on a coral reef.jpg"
+    },
+    {
+        "prompt_en": "a dandelion blowing in the wind",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "plant",
+        "image_name": "a dandelion blowing in the wind.jpg"
+    },
+    {
+        "prompt_en": "A boiling pot cooking vegetables",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "A boiling pot cooking vegetables.jpg"
+    },
+    {
+        "prompt_en": "a woman stirring food in a pan on the stove",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a woman stirring food in a pan on the stove.jpg"
+    },
+    {
+        "prompt_en": "two eggs are fried in a frying pan on the stove",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "two eggs are fried in a frying pan on the stove.jpg"
+    },
+    {
+        "prompt_en": "fried onion rings in a basket",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "fried onion rings in a basket.jpg"
+    },
+    {
+        "prompt_en": "a pot is sitting on top of a campfire",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a pot is sitting on top of a campfire.jpg"
+    },
+    {
+        "prompt_en": "a chef is preparing a dish with mushrooms on a wooden board",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a chef is preparing a dish with mushrooms on a wooden board.jpg"
+    },
+    {
+        "prompt_en": "a hand holding a slice of pizza",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a hand holding a slice of pizza.jpg"
+    },
+    {
+        "prompt_en": "A person is using tongs to pick up meat from a plate",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "A person is using tongs to pick up meat from a plate.jpg"
+    },
+    {
+        "prompt_en": "The meat is picked up from the grill with tongs",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "The meat is picked up from the grill with tongs.jpg"
+    },
+    {
+        "prompt_en": "A person is whisking eggs, and the egg whites and yolks are gently streaming out",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "A person is whisking eggs, and the egg whites and yolks are gently streaming out.jpg"
+    },
+    {
+        "prompt_en": "a person is putting sauce on a burger",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person is putting sauce on a burger.jpg"
+    },
+    {
+        "prompt_en": "A person is making dumplings",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "A person is making dumplings.jpg"
+    },
+    {
+        "prompt_en": "a pan filled with fried food",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a pan filled with fried food.jpg"
+    },
+    {
+        "prompt_en": "Chopsticks are slowly picking up the buns from the plastic container",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "Chopsticks are slowly picking up the buns from the plastic container.jpg"
+    },
+    {
+        "prompt_en": "a basket of french fries in a fryer",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a basket of french fries in a fryer.jpg"
+    },
+    {
+        "prompt_en": "a table with lobsters and drinks on it",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a table with lobsters and drinks on it.jpg"
+    },
+    {
+        "prompt_en": "a person pouring coffee into a pot on a stove",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person pouring coffee into a pot on a stove.jpg"
+    },
+    {
+        "prompt_en": "a kettle is sitting on top of a campfire",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a kettle is sitting on top of a campfire.jpg"
+    },
+    {
+        "prompt_en": "Chopsticks are picking up noodles from the bowl",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "Chopsticks are picking up noodles from the bowl.jpg"
+    },
+    {
+        "prompt_en": "a person is cooking eggs on an outdoor grill",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person is cooking eggs on an outdoor grill.jpg"
+    },
+    {
+        "prompt_en": "a person is cooking food in a wok on a stove",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person is cooking food in a wok on a stove.jpg"
+    },
+    {
+        "prompt_en": "a person is holding up a burger with his hands",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person is holding up a burger with his hands.jpg"
+    },
+    {
+        "prompt_en": "A person is pouring water into a teacup",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "A person is pouring water into a teacup.jpg"
+    },
+    {
+        "prompt_en": "a person pouring seasoning into a pot of food",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person pouring seasoning into a pot of food.jpg"
+    },
+    {
+        "prompt_en": "a person holding a taco in their hand",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person holding a taco in their hand.jpg"
+    },
+    {
+        "prompt_en": "a person slicing salmon on a cutting board",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person slicing salmon on a cutting board.jpg"
+    },
+    {
+        "prompt_en": "a bunch of food is cooking on a grill over an open fire",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a bunch of food is cooking on a grill over an open fire.jpg"
+    },
+    {
+        "prompt_en": "a close up of a piece of sushi on chopsticks",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a close up of a piece of sushi on chopsticks.jpg"
+    },
+    {
+        "prompt_en": "a group of pots on a stove with flames in the background",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a group of pots on a stove with flames in the background.jpg"
+    },
+    {
+        "prompt_en": "a person cooking vegetables in a pan on a stove",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person cooking vegetables in a pan on a stove.jpg"
+    },
+    {
+        "prompt_en": "a large pot of soup filled with vegetables and meat",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a large pot of soup filled with vegetables and meat.jpg"
+    },
+    {
+        "prompt_en": "a person holding chopsticks over a bowl of food",
+        "dimension": [
+            "i2v_subject",
+            "subject_consistency",
+            "motion_smoothness",
+            "dynamic_degree",
+            "aesthetic_quality",
+            "imaging_quality",
+            "temporal_flickering"
+        ],
+        "image_type": "food",
+        "image_name": "a person holding chopsticks over a bowl of food.jpg"
+    }
+]
diff --git a/gradio/app.py b/gradio/app.py
new file mode 100644
index 0000000..d6ddbcb
--- /dev/null
+++ b/gradio/app.py
@@ -0,0 +1,758 @@
+#!/usr/bin/env python
+"""
+This script runs a Gradio App for the Open-Sora model.
+
+Usage:
+    python gradio/app.py [--model-type v1.3] [--output DIR] [--port PORT] [--host HOST] [--share] [--enable-optimization]
+"""
+
+import argparse
+import datetime
+import importlib
+import os
+import subprocess
+import sys
+from tempfile import NamedTemporaryFile
+
+import spaces
+import torch
+
+import gradio as gr
+
+# Values accepted by the ``--model-type`` command-line option.
+MODEL_TYPES = ["v1.3"]
+# Watermark image handed to ``add_watermark``; watermarking is skipped when this path does not exist.
+WATERMARK_PATH = "./assets/images/watermark/watermark.png"
+# Inference config files, relative to the working directory.
+# "v1.3" is read for text-conditioned runs, "v1.3_i2v" when the mode is "i2v".
+CONFIG_MAP = {
+    "v1.3": "configs/opensora-v1-3/inference/t2v.py",
+    "v1.3_i2v": "configs/opensora-v1-3/inference/v2v.py",
+}
+# Identifiers passed to ``STDiT3.from_pretrained``.
+# "t2v" is keyed by resolution; "i2v" uses one checkpoint for every resolution.
+HF_STDIT_MAP = {
+    "t2v": {
+        "360p": "hpcaitech/OpenSora-STDiT-v4-360p",
+        "720p": "hpcaitech/OpenSora-STDiT-v4",
+    },
+    "i2v": "hpcaitech/OpenSora-STDiT-v4-i2v",
+}
+
+
+# ============================
+# Prepare Runtime Environment
+# ============================
+def install_dependencies(enable_optimization=False):
+    """
+    Install the required dependencies for the demo if they are not already installed.
+    """
+
+    def _is_package_available(name) -> bool:
+        try:
+            importlib.import_module(name)
+            return True
+        except (ImportError, ModuleNotFoundError):
+            return False
+
+    if enable_optimization:
+        # install flash attention
+        if not _is_package_available("flash_attn"):
+            subprocess.run(
+                f"{sys.executable} -m pip install flash-attn --no-build-isolation",
+                env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+                shell=True,
+            )
+
+        # install apex for fused layernorm
+        if not _is_package_available("apex"):
+            subprocess.run(
+                f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git',
+                shell=True,
+            )
+
+        # install ninja
+        if not _is_package_available("ninja"):
+            subprocess.run(f"{sys.executable} -m pip install ninja", shell=True)
+
+        # install xformers
+        if not _is_package_available("xformers"):
+            subprocess.run(
+                f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
+                shell=True,
+            )
+
+
+# ============================
+# Model-related
+# ============================
+def read_config(config_path):
+    """
+    Read the configuration file.
+    """
+    from mmengine.config import Config
+
+    return Config.fromfile(config_path)
+
+
+def build_models(mode, resolution, enable_optimization=False):
+    """
+    Build the models for the given mode, resolution, and configuration.
+    """
+    # build vae
+    from opensora.registry import MODELS, build_module
+
+    if mode == "i2v":
+        config = read_config(CONFIG_MAP["v1.3_i2v"])
+    else:
+        config = read_config(CONFIG_MAP["v1.3"])
+
+    vae = build_module(config.vae, MODELS).cuda()
+
+    # build text encoder
+    text_encoder = build_module(config.text_encoder, MODELS)  # T5 must be fp32
+    text_encoder.t5.model = text_encoder.t5.model.cuda()
+
+    # Determine model weights based on mode and resolution
+    if mode == "i2v":
+        weight_path = HF_STDIT_MAP["i2v"]
+    else:  # t2v
+        weight_path = HF_STDIT_MAP["t2v"].get(resolution, None)
+        if not weight_path:
+            raise ValueError(f"Unsupported resolution {resolution} for mode {mode}")
+
+    # build stdit
+    from opensora.models.stdit.stdit3 import STDiT3
+
+    model_kwargs = {k: v for k, v in config.model.items() if k not in ("type", "from_pretrained", "force_huggingface")}
+
+    print("Load STDIT3 from ", weight_path)
+    stdit = STDiT3.from_pretrained(weight_path, **model_kwargs).cuda()
+
+    # build scheduler
+    from opensora.registry import SCHEDULERS
+
+    scheduler = build_module(config.scheduler, SCHEDULERS)
+
+    # hack for classifier-free guidance
+    text_encoder.y_embedder = stdit.y_embedder
+
+    # move models to device
+    vae = vae.to(torch.bfloat16).eval()
+    text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must be in fp32
+    stdit = stdit.to(torch.bfloat16).eval()
+
+    # clear cuda
+    torch.cuda.empty_cache()
+    return vae, text_encoder, stdit, scheduler, config
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-type",
+        default="v1.3",
+        choices=MODEL_TYPES,
+        help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
+    )
+    parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
+    parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
+    parser.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.")
+    parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
+    parser.add_argument(
+        "--enable-optimization",
+        action="store_true",
+        help="Whether to enable optimization such as flash attention and fused layernorm",
+    )
+    return parser.parse_args()
+
+
+# ============================
+# Main Gradio Script
+# ============================
+# as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text
+# so we can't pass the models to `run_inference` as arguments.
+# instead, we need to define them globally so that we can access these models inside `run_inference`
+
+# read config
+# NOTE: everything below runs at import time, so importing this module parses
+# sys.argv, creates the output folder and may invoke pip.
+args = parse_args()
+# module-level config for the selected model type; it is only used below to derive ``dtype``
+config = read_config(CONFIG_MAP[args.model_type])
+# allow TF32 for CUDA matmul and cuDNN
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+# make outputs dir
+os.makedirs(args.output, exist_ok=True)
+
+# disable torch jit as it can cause failure in gradio SDK
+# gradio sdk uses torch with cuda 11.3
+torch.jit._state.disable()
+
+# set up
+# installs the optional packages only when --enable-optimization was given
+install_dependencies(enable_optimization=args.enable_optimization)
+
+# import after installation
+# (kept below install_dependencies on purpose: the call above must run before these imports)
+from opensora.datasets import IMG_FPS, save_sample
+from opensora.datasets.aspect import get_image_size, get_num_frames
+from opensora.models.text_encoder.t5 import text_preprocessing
+from opensora.utils.inference_utils import (
+    add_watermark,
+    append_generated,
+    append_score_to_prompts,
+    apply_mask_strategy,
+    collect_references_batch,
+    dframe_to_frame,
+    extract_json_from_prompts,
+    extract_prompts_loop,
+    get_random_prompt_by_openai,
+    has_openai_key,
+    merge_prompt,
+    prepare_multi_resolution_info,
+    refine_prompts_by_openai,
+    split_prompt,
+    prep_ref_and_update_mask_in_loop,
+    prep_ref_and_mask
+)
+from opensora.utils.misc import to_torch_dtype
+
+# some global variables
+# dtype comes from the module-level config; the device is always CUDA
+dtype = to_torch_dtype(config.dtype)
+device = torch.device("cuda")
+
+# build model
+def initialize_models(mode, resolution):
+    return build_models(mode, resolution, enable_optimization=args.enable_optimization)
+
+
+def run_inference(
+    mode,
+    prompt_text,
+    resolution,
+    aspect_ratio,
+    length,
+    motion_strength,
+    aesthetic_score,
+    use_motion_strength,
+    use_aesthetic_score,
+    camera_motion,
+    reference_image,
+    refine_prompt,
+    fps,
+    num_loop,
+    seed,
+    sampling_steps,
+    cfg_scale,
+):
+    if prompt_text is None or prompt_text == "":
+        gr.Warning("Your prompt is empty, please enter a valid prompt")
+        return None
+
+    # Dynamically choose mode based on reference image
+    if reference_image is not None and mode != "Text2Image":
+        mode = "i2v"
+
+    # Initialize models
+    vae, text_encoder, stdit, scheduler, config = initialize_models(mode, resolution)
+
+    torch.manual_seed(seed)
+    with torch.inference_mode():
+        # ======================
+        # 1. Preparation arguments
+        # ======================
+        # parse the inputs
+        # frame_interval must be 1 so  we ignore it here
+        image_size = get_image_size(resolution, aspect_ratio)
+
+        use_sdedit = config.get("use_sdedit", False)
+        use_oscillation_guidance_for_text = config.get("use_oscillation_guidance_for_text", None)
+        use_oscillation_guidance_for_image = config.get("use_oscillation_guidance_for_image", None)
+
+        cond_type = config.get("cond_type", None)
+        cond_type = None if cond_type == "none" else cond_type
+        mask_index = None
+        ref = None
+        image_cfg_scale = None
+
+        # compute generation parameters
+        if mode == "Text2Image":
+            num_frames = 1
+            fps = IMG_FPS
+        else:
+            num_frames = config.num_frames
+            num_frames = get_num_frames(length)
+
+        condition_frame_length = config.get("condition_frame_length", 5)
+        condition_frame_edit = config.get("condition_frame_edit", 0.0)
+
+        input_size = (num_frames, *image_size)
+        latent_size = vae.get_latent_size(input_size)
+        multi_resolution = "OpenSora"
+        align = 5
+
+        # == prepare mask strategy ==
+        if mode == "Text2Image":
+            mask_strategy = [None]
+            mask_index = []
+        elif mode == "Text2Video":
+            if reference_image is not None:
+                mask_strategy = ["0"]
+                mask_index = [0]
+            else:
+                mask_strategy = [None]
+                mask_index = []
+        elif mode == "i2v":
+            mask_strategy = ["0"]
+            mask_index = [0]
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        # == prepare reference ==
+        if mode == "Text2Image":
+            refs = [""]
+        elif mode == "Text2Video":
+            if reference_image is not None:
+                # save image to disk
+                from PIL import Image
+
+                im = Image.fromarray(reference_image)
+                temp_file = NamedTemporaryFile(suffix=".png")
+                im.save(temp_file.name)
+                refs = [temp_file.name]
+            else:
+                refs = [""]
+        elif mode == "i2v":
+            if reference_image is not None:
+                # save image to disk
+                from PIL import Image
+
+                im = Image.fromarray(reference_image)
+                temp_file = NamedTemporaryFile(suffix=".png")
+                im.save(temp_file.name)
+                refs = [temp_file.name]
+            else:
+                refs = [""]
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        # == get json from prompts ==
+        batch_prompts = [prompt_text]
+        batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy)
+
+        # == get reference for condition ==
+        refs = collect_references_batch(refs, vae, image_size)
+
+        target_shape = [len(batch_prompts), vae.out_channels, *latent_size]
+        if mode == "i2v":
+            image_cfg_scale = config.get("image_cfg_scale", 7.5)
+            ref, mask_index = prep_ref_and_mask(
+                cond_type, condition_frame_length, refs, target_shape, num_loop, device, dtype
+            )
+
+        # == multi-resolution info ==
+        model_args = prepare_multi_resolution_info(
+            multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
+        )
+
+        # == process prompts step by step ==
+        # 0. split prompt
+        # each element in the list is [prompt_segment_list, loop_idx_list]
+        batched_prompt_segment_list = []
+        batched_loop_idx_list = []
+        for prompt in batch_prompts:
+            prompt_segment_list, loop_idx_list = split_prompt(prompt)
+            batched_prompt_segment_list.append(prompt_segment_list)
+            batched_loop_idx_list.append(loop_idx_list)
+
+        # 1. refine prompt by openai
+        if refine_prompt:
+            # check if openai key is provided
+            if not has_openai_key():
+                gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.")
+            else:
+                for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
+                    batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list)
+
+        # process scores
+        aesthetic_score = aesthetic_score if use_aesthetic_score else None
+        motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None
+        camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion
+        # 2. append score
+        for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
+            batched_prompt_segment_list[idx] = append_score_to_prompts(
+                prompt_segment_list,
+                aes=aesthetic_score,
+                flow=motion_strength,
+                camera_motion=camera_motion,
+            )
+
+        # 3. clean prompt with T5
+        for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
+            batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list]
+
+        # 4. merge to obtain the final prompt
+        batch_prompts = []
+        for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list):
+            batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list))
+
+        # =========================
+        # Generate image/video
+        # =========================
+        video_clips = []
+        for loop_i in range(num_loop):
+            # 4.4 sample in hidden space
+            batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)
+
+            # == loop ==
+            # if loop_i > 0:
+            #     refs, mask_strategy = append_generated(
+            #         vae, video_clips[-1], refs, mask_strategy, loop_i, condition_frame_length, condition_frame_edit
+            #     )
+
+            # == sampling ==
+            z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
+            masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align) if mask_index is None else None
+            x_cond_mask = torch.zeros(len(batch_prompts), vae.out_channels, *latent_size, device=device).to(dtype) if mask_index is not None else None
+            if x_cond_mask is not None and mask_index is not None:
+                    x_cond_mask[:, :, mask_index, :, :] = 1.0
+
+            # 4.6. diffusion sampling
+            # hack to update num_sampling_steps and cfg_scale
+            scheduler_kwargs = config.scheduler.copy()
+            scheduler_kwargs.pop("type")
+            scheduler_kwargs["num_sampling_steps"] = sampling_steps
+            scheduler_kwargs["cfg_scale"] = cfg_scale
+
+            scheduler.__init__(**scheduler_kwargs)
+            samples = scheduler.sample(
+                stdit,
+                text_encoder,
+                z=z,
+                z_cond=ref,
+                z_cond_mask=x_cond_mask,
+                prompts=batch_prompts_loop,
+                device=device,
+                additional_args=model_args,
+                progress=True,
+                mask=masks,
+                mask_index=mask_index,
+                image_cfg_scale=image_cfg_scale,
+                use_sdedit=use_sdedit,
+                use_oscillation_guidance_for_text=use_oscillation_guidance_for_text,
+                use_oscillation_guidance_for_image=use_oscillation_guidance_for_image,
+            )
+
+            if loop_i > 1:  # process conditions for subsequent loop
+                    if cond_type is not None:  # i2v or v2v
+                        is_last_loop = loop_i == loop_i - 1
+                        ref, mask_index = prep_ref_and_update_mask_in_loop(
+                            cond_type,
+                            condition_frame_length,
+                            samples,
+                            refs,
+                            target_shape,
+                            is_last_loop,
+                            device,
+                            dtype,
+                        )
+
+                    else:
+                        refs, mask_strategy = append_generated(
+                            vae,
+                            samples,
+                            refs,
+                            mask_strategy,
+                            loop_i,
+                            condition_frame_length,
+                            condition_frame_edit,
+                            is_latent=True,
+                        )
+
+            # samples = vae.decode(samples.to(dtype), num_frames=num_frames)
+            video_clips.append(samples)
+
+        # =========================
+        # Save output
+        # =========================
+        video_clips = [val[0] for val in video_clips]
+        for i in range(1, num_loop):
+            video_clips[i] = video_clips[i][:, condition_frame_length:]
+        video = torch.cat(video_clips, dim=1)
+
+        t_cut = max(video.size(1) // 5 * 5, 1)
+        if t_cut < video.size(1):
+            video = video[:, :t_cut]
+        
+        video = vae.decode(video.to(dtype), num_frames=t_cut * 17 // 5).squeeze(0)
+
+        current_datetime = datetime.datetime.now()
+        timestamp = current_datetime.timestamp()
+
+        save_path = os.path.join(args.output, f"output_{timestamp}")
+        saved_path = save_sample(video, save_path=save_path, fps=24)
+        torch.cuda.empty_cache()
+
+        # add watermark
+        if mode != "Text2Image" and os.path.exists(WATERMARK_PATH):
+            watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4")
+            success = add_watermark(saved_path, WATERMARK_PATH, watermarked_path)
+            if success:
+                return watermarked_path
+            else:
+                return saved_path
+        else:
+            return saved_path
+
+
+
+@spaces.GPU(duration=200)
+def run_image_inference(
+    prompt_text,
+    resolution,
+    aspect_ratio,
+    length,
+    motion_strength,
+    aesthetic_score,
+    use_motion_strength,
+    use_aesthetic_score,
+    camera_motion,
+    reference_image,
+    refine_prompt,
+    fps,
+    num_loop,
+    seed,
+    sampling_steps,
+    cfg_scale,
+):
+    """Run the shared inference entry point in ``"Text2Image"`` mode.
+
+    Every argument is forwarded positionally, unchanged and in declaration
+    order, to ``run_inference``; its return value is returned as-is.
+    """
+    forwarded = (
+        prompt_text,
+        resolution,
+        aspect_ratio,
+        length,
+        motion_strength,
+        aesthetic_score,
+        use_motion_strength,
+        use_aesthetic_score,
+        camera_motion,
+        reference_image,
+        refine_prompt,
+        fps,
+        num_loop,
+        seed,
+        sampling_steps,
+        cfg_scale,
+    )
+    return run_inference("Text2Image", *forwarded)
+
+
+@spaces.GPU(duration=200)
+def run_video_inference(
+    prompt_text,
+    resolution,
+    aspect_ratio,
+    length,
+    motion_strength,
+    aesthetic_score,
+    use_motion_strength,
+    use_aesthetic_score,
+    camera_motion,
+    reference_image,
+    refine_prompt,
+    fps,
+    num_loop,
+    seed,
+    sampling_steps,
+    cfg_scale,
+):
+    """Run the shared inference entry point in ``"Text2Video"`` mode.
+
+    Every argument is forwarded positionally, unchanged and in declaration
+    order, to ``run_inference``; its return value is returned as-is.
+    """
+    # NOTE: an earlier guard that refused some resolution/length combinations
+    # (480p with 16s, 720p with 8s or 16s) with a CUDA out-of-memory warning
+    # is disabled; every request is forwarded unconditionally.
+    forwarded = (
+        prompt_text,
+        resolution,
+        aspect_ratio,
+        length,
+        motion_strength,
+        aesthetic_score,
+        use_motion_strength,
+        use_aesthetic_score,
+        camera_motion,
+        reference_image,
+        refine_prompt,
+        fps,
+        num_loop,
+        seed,
+        sampling_steps,
+        cfg_scale,
+    )
+    return run_inference("Text2Video", *forwarded)
+
+
+def generate_random_prompt():
+    """Return a prompt from ``get_random_prompt_by_openai``, or ``None``.
+
+    When ``OPENAI_API_KEY`` is missing from the environment, a Gradio warning
+    is raised for the user and ``None`` is returned instead of a prompt.
+    """
+    if "OPENAI_API_KEY" in os.environ:
+        return get_random_prompt_by_openai()
+
+    gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt")
+    return None
+
+
+def main():
+    """Build the Gradio demo UI, wire its buttons to inference, and launch it.
+
+    The page consists of a banner row, a settings column (prompt, basic and
+    advanced generation options, optional reference image), an output video
+    column, and two buttons that call ``run_image_inference`` and
+    ``run_video_inference`` with the same ordered list of inputs.
+
+    Reads ``args.port``, ``args.host`` and ``args.share``; ``args`` is not
+    defined in this function (presumably module-level parsed CLI arguments).
+    """
+    # create demo
+    with gr.Blocks() as demo:
+        # Banner: logo, project badges/links and the page title.
+        with gr.Row():
+            with gr.Column():
+                gr.HTML(
+                    """
+                <div style='text-align: center;'>
+                    <p align="center">
+                        <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/icon.png" width="250"/>
+                    </p>
+                    <div style="display: flex; gap: 10px; justify-content: center;">
+                        <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
+                        <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
+                        <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
+                        <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
+                        <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
+                        <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
+                        <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
+                    </div>
+                    <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
+                </div>
+                """
+                )
+
+        with gr.Row():
+            # Left column: every user-controllable input.
+            with gr.Column():
+                prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4)
+                # The GPT-backed controls are enabled only when has_openai_key() is truthy.
+                refine_prompt = gr.Checkbox(
+                    value=has_openai_key(), label="Refine prompt with GPT4o", interactive=has_openai_key()
+                )
+                random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=has_openai_key())
+
+                gr.Markdown("## Basic Settings")
+                resolution = gr.Radio(
+                    choices=["360p", "720p"],
+                    value="720p",
+                    label="Resolution",
+                )
+                aspect_ratio = gr.Radio(
+                    choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
+                    value="9:16",
+                    label="Aspect Ratio (H:W)",
+                )
+                length = gr.Radio(
+                    choices=[1, 49, 65, 81, 97, 113],
+                    value=97,
+                    label="Video Length (Number of Frames)",
+                    info="Setting the number of frames to 1 indicates image generation instead of video generation.",
+                )
+
+                with gr.Row():
+                    seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed")
+
+                    sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps")
+                    cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale")
+
+                # Each conditioning choice is paired with an "Enable" checkbox
+                # that is passed to inference as a separate input.
+                with gr.Row():
+                    with gr.Column():
+                        motion_strength = gr.Radio(
+                            choices=["very low", "low", "fair", "high", "very high", "extremely high"],
+                            value="fair",
+                            label="Motion Strength",
+                            info="Only effective for video generation",
+                        )
+                        use_motion_strength = gr.Checkbox(value=True, label="Enable")
+
+                    with gr.Column():
+                        aesthetic_score = gr.Radio(
+                            choices=["terrible", "very poor", "poor", "fair", "good", "very good", "excellent"],
+                            value="excellent",
+                            label="Aesthetic",
+                            info="Effective for text & video generation",
+                        )
+                        use_aesthetic_score = gr.Checkbox(value=True, label="Enable")
+
+                camera_motion = gr.Radio(
+                    value="none",
+                    label="Camera Motion",
+                    choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"],
+                    interactive=True,
+                )
+
+                gr.Markdown("## Advanced Settings")
+                with gr.Row():
+                    fps = gr.Slider(
+                        value=24,
+                        minimum=1,
+                        maximum=60,
+                        step=1,
+                        label="FPS",
+                        info="This is the frames per seconds for video generation, keep it to 24 if you are not sure",
+                    )
+                    num_loop = gr.Slider(
+                        value=1,
+                        minimum=1,
+                        maximum=20,
+                        step=1,
+                        label="Number of Loops",
+                        info="This will change the length of the generated video, keep it to 1 if you are not sure",
+                    )
+
+                gr.Markdown("## Reference Image")
+                reference_image = gr.Image(label="Image (optional)", show_download_button=True)
+
+            # Right column: the generated video.
+            with gr.Column():
+                output_video = gr.Video(label="Output Video", height="100%")
+
+        with gr.Row():
+            image_gen_button = gr.Button("Generate image")
+            video_gen_button = gr.Button("Generate video")
+
+        # The inputs list order must match the positional parameters of
+        # run_image_inference. The result is written into the reference
+        # image component rather than into the video player.
+        image_gen_button.click(
+            fn=run_image_inference,
+            inputs=[
+                prompt_text,
+                resolution,
+                aspect_ratio,
+                length,
+                motion_strength,
+                aesthetic_score,
+                use_motion_strength,
+                use_aesthetic_score,
+                camera_motion,
+                reference_image,
+                refine_prompt,
+                fps,
+                num_loop,
+                seed,
+                sampling_steps,
+                cfg_scale,
+            ],
+            outputs=reference_image,
+        )
+
+        # Same ordered inputs as above; the result goes to the video player.
+        video_gen_button.click(
+            fn=run_video_inference,
+            inputs=[
+                prompt_text,
+                resolution,
+                aspect_ratio,
+                length,
+                motion_strength,
+                aesthetic_score,
+                use_motion_strength,
+                use_aesthetic_score,
+                camera_motion,
+                reference_image,
+                refine_prompt,
+                fps,
+                num_loop,
+                seed,
+                sampling_steps,
+                cfg_scale,
+            ],
+            outputs=output_video,
+        )
+        # Takes no inputs; fills the prompt textbox with the returned text.
+        random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text)
+
+    # launch
+    # Queue holds at most 5 pending requests with a default concurrency limit of 1.
+    demo.queue(max_size=5, default_concurrency_limit=1)
+    demo.launch(server_port=args.port, server_name=args.host, share=args.share, max_threads=1)
+
+
+# Build and launch the demo only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/opensora/__init__.py b/opensora/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/acceleration/__init__.py b/opensora/acceleration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/acceleration/checkpoint.py b/opensora/acceleration/checkpoint.py
new file mode 100644
index 0000000..f5ba325
--- /dev/null
+++ b/opensora/acceleration/checkpoint.py
@@ -0,0 +1,271 @@
+import warnings
+from collections.abc import Iterable
+from typing import Callable, ContextManager, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from colossalai.utils import get_current_device
+from torch.utils.checkpoint import (
+    _DEFAULT_DETERMINISM_MODE,
+    CheckpointFunction,
+    _checkpoint_without_reentrant_generator,
+    checkpoint_sequential,
+    noop_context_fn,
+)
+
+
+class ActivationManager:
+    """Stack-style staging of tensors into one preallocated pinned buffer.
+
+    ``offload`` copies a tensor into the next free slice of ``buffer`` and
+    repoints ``x.data`` at that slice; ``onload`` moves the most recently
+    offloaded tensor back to the current device. Tensors must be onloaded in
+    the reverse order of offloading. The manager does nothing until
+    ``setup_buffer`` has been called.
+    """
+
+    def __init__(self):
+        # Disabled until setup_buffer() allocates the staging buffer.
+        self.enable = False
+        self.buffer = None
+        self.total_size = 0
+        # Index of the first free element in the buffer.
+        self.avail_offset = 0
+        # id() of each offloaded tensor, most recent last.
+        self.tensor_id_queue = []
+        # id() of tensors that offload()/onload() must leave untouched.
+        self.ignore_tensor_id_set = set()
+
+    def _is_managed(self, x: torch.Tensor) -> bool:
+        """Return True when the manager is enabled and ``x`` is not ignored."""
+        return self.enable and id(x) not in self.ignore_tensor_id_set
+
+    def setup_buffer(self, numel: int, dtype: torch.dtype):
+        """Allocate a pinned buffer of ``numel`` elements and enable the manager."""
+        self.buffer = torch.empty(numel, dtype=dtype, pin_memory=True)
+        self.total_size = numel
+        self.enable = True
+
+    def offload(self, x: torch.Tensor) -> None:
+        """Copy ``x`` into the buffer and make ``x.data`` a view of that slice.
+
+        Raises:
+            RuntimeError: if the remaining buffer space cannot hold ``x``.
+        """
+        if not self._is_managed(x):
+            return
+        start = self.avail_offset
+        stop = start + x.numel()
+        if stop > self.total_size:
+            raise RuntimeError("Activation buffer is full")
+        assert x.dtype == self.buffer.dtype, "Wrong dtype of offload tensor"
+        staged = self.buffer[start:stop].view_as(x)
+        staged.copy_(x)
+        x.data = staged
+        self.avail_offset = stop
+        self.tensor_id_queue.append(id(x))
+
+    def onload(self, x: torch.Tensor) -> None:
+        """Move the most recently offloaded tensor ``x`` back to the current device."""
+        if not self._is_managed(x):
+            return
+        assert self.tensor_id_queue[-1] == id(x), "Wrong order of offload/onload"
+        # current x is pinned memory
+        assert x.data.is_pinned()
+        x.data = x.data.to(get_current_device(), non_blocking=True)
+        self.tensor_id_queue.pop()
+        self.avail_offset -= x.numel()
+        # Once nothing is staged any more, the ignore set is reset as well.
+        if not self.tensor_id_queue:
+            self.ignore_tensor_id_set.clear()
+
+    def add_ignore_tensor(self, x: torch.Tensor) -> None:
+        """Mark ``x`` so that later offload()/onload() calls skip it."""
+        self.ignore_tensor_id_set.add(id(x))
+
+    def is_top_tensor(self, x: torch.Tensor) -> bool:
+        """Return True if ``x`` is the most recently offloaded tensor."""
+        pending = self.tensor_id_queue
+        return bool(pending) and pending[-1] == id(x)
+
+
+# Process-wide manager shared by the offloading checkpoint function.
+GLOBAL_ACTIVATION_MANAGER = ActivationManager()
+
+
+class CheckpointFunctionWithOffload(torch.autograd.Function):
+    """Reentrant checkpoint function that also stages its inputs via the activation manager.
+
+    Forward and backward delegate to torch's ``CheckpointFunction``; this class
+    only adds ``GLOBAL_ACTIVATION_MANAGER`` offload/onload calls around them.
+    When the manager is disabled those calls return immediately.
+    """
+
+    @staticmethod
+    def forward(ctx, run_function, preserve_rng_state, *args):
+        # Walk the inputs in reverse so the most recently offloaded one is seen first.
+        for x in args[::-1]:
+            # handle those tensors are used in multiple checkpoints
+            # An input that is currently on top of the offload stack is brought
+            # back and then marked as ignored, so the offload loop below (and
+            # later onload calls) skip it.
+            if GLOBAL_ACTIVATION_MANAGER.is_top_tensor(x):
+                GLOBAL_ACTIVATION_MANAGER.onload(x)
+                GLOBAL_ACTIVATION_MANAGER.add_ignore_tensor(x)
+        out = CheckpointFunction.forward(ctx, run_function, preserve_rng_state, *args)
+        # After the forward pass, stage every tensor input in the manager's buffer.
+        for x in args:
+            if torch.is_tensor(x):
+                GLOBAL_ACTIVATION_MANAGER.offload(x)
+        return out
+
+    @staticmethod
+    def backward(ctx, *args):
+        # with stack-fashion, the last tensor is the first to be loaded
+        for tensor in ctx.saved_tensors[::-1]:
+            GLOBAL_ACTIVATION_MANAGER.onload(tensor)
+        return CheckpointFunction.backward(ctx, *args)
+
+
+# TorchDynamo does not step inside the utils.checkpoint function. The flow
+# looks like this:
+#  1) TorchDynamo tries to wrap utils.checkpoint in a HigherOrderOp by
+#     speculatively checking if the forward function is safe to trace.
+#  2) If yes, then the Dynamo-generated Fx graph has the wrapped higher
+#     order op. As a result, TorchDynamo does not look inside utils.checkpoint.
+#  3) If not, then TorchDynamo falls back to eager by performing a graph
+#     break. And here, the following disable wrapper ensures that
+#     TorchDynamo does not trigger again on the frames created by
+#     utils.checkpoint innards.
+@torch._disable_dynamo
+def checkpoint(
+    function,
+    *args,
+    use_reentrant: Optional[bool] = None,
+    context_fn: Callable[[], Tuple[ContextManager, ContextManager]] = noop_context_fn,
+    determinism_check: str = _DEFAULT_DETERMINISM_MODE,
+    debug: bool = False,
+    **kwargs,
+):
+    r"""Checkpoint a model or part of the model.
+
+    Activation checkpointing is a technique that trades compute for memory.
+    Instead of keeping tensors needed for backward alive until they are used in
+    gradient computation during backward, forward computation in checkpointed
+    regions omits saving tensors for backward and recomputes them during the
+    backward pass. Activation checkpointing can be applied to any part of a
+    model.
+
+    There are currently two checkpointing implementations available, determined
+    by the :attr:`use_reentrant` parameter. It is recommended that you use
+    ``use_reentrant=False``. Please refer the note below for a discussion of
+    their differences.
+
+    In this module the reentrant variant (``use_reentrant=True``) runs through
+    ``CheckpointFunctionWithOffload``, which additionally offloads/onloads the
+    checkpoint inputs via ``GLOBAL_ACTIVATION_MANAGER``. The non-reentrant
+    variant uses torch's ``_checkpoint_without_reentrant_generator`` directly.
+
+    .. warning::
+
+        If the :attr:`function` invocation during the backward pass differs
+        from the forward pass, e.g., due to a global variable, the checkpointed
+        version may not be equivalent, potentially causing an
+        error being raised or leading to silently incorrect gradients.
+
+    .. warning::
+
+        The ``use_reentrant`` parameter should be passed explicitly. In version
+        2.4 we will raise an exception if ``use_reentrant`` is not passed.
+        If you are using the ``use_reentrant=True`` variant, please refer to the
+        note below for important considerations and potential limitations.
+
+    .. note::
+
+        The reentrant variant of checkpoint (``use_reentrant=True``) and
+        the non-reentrant variant of checkpoint (``use_reentrant=False``)
+        differ in the following ways:
+
+        * Non-reentrant checkpoint stops recomputation as soon as all needed
+          intermediate activations have been recomputed. This feature is enabled
+          by default, but can be disabled with :func:`set_checkpoint_early_stop`.
+          Reentrant checkpoint always recomputes :attr:`function` in its
+          entirety during the backward pass.
+
+        * The reentrant variant does not record the autograd graph during the
+          forward pass, as it runs with the forward pass under
+          :func:`torch.no_grad`. The non-reentrant version does record the
+          autograd graph, allowing one to perform backward on the graph within
+          checkpointed regions.
+
+        * The reentrant checkpoint only supports the
+          :func:`torch.autograd.backward` API for the backward pass without its
+          `inputs` argument, while the non-reentrant version supports all ways
+          of performing the backward pass.
+
+        * At least one input and output must have ``requires_grad=True`` for the
+          reentrant variant. If this condition is unmet, the checkpointed part
+          of the model will not have gradients. The non-reentrant version does
+          not have this requirement.
+
+        * The reentrant version does not consider tensors in nested structures
+          (e.g., custom objects, lists, dicts, etc) as participating in
+          autograd, while the non-reentrant version does.
+
+        * The reentrant checkpoint does not support checkpointed regions with
+          detached tensors from the computational graph, whereas the
+          non-reentrant version does. For the reentrant variant, if the
+          checkpointed segment contains tensors detached using ``detach()`` or
+          with :func:`torch.no_grad`, the backward pass will raise an error.
+          This is because ``checkpoint`` makes all the outputs require gradients
+          and this causes issues when a tensor is defined to have no gradient in
+          the model. To avoid this, detach the tensors outside of the
+          ``checkpoint`` function.
+
+    Args:
+        function: describes what to run in the forward pass of the model or
+            part of the model. It should also know how to handle the inputs
+            passed as the tuple. For example, in LSTM, if user passes
+            ``(activation, hidden)``, :attr:`function` should correctly use the
+            first input as ``activation`` and the second input as ``hidden``
+        preserve_rng_state(bool, optional):  Omit stashing and restoring
+            the RNG state during each checkpoint. Note that under torch.compile,
+            this flag doesn't take effect and we always preserve RNG state.
+            It is read from ``kwargs`` (and removed from them).
+            Default: ``True``
+        use_reentrant(bool):
+            specify whether to use the activation checkpoint variant that
+            requires reentrant autograd. This parameter should be passed
+            explicitly. In version 2.4 we will raise an exception if
+            ``use_reentrant`` is not passed. If ``use_reentrant=False``,
+            ``checkpoint`` will use an implementation that does not require
+            reentrant autograd. This allows ``checkpoint`` to support additional
+            functionality, such as working as expected with
+            ``torch.autograd.grad`` and support for keyword arguments input into
+            the checkpointed function.
+        context_fn(Callable, optional): A callable returning a tuple of two
+            context managers. The function and its recomputation will be run
+            under the first and second context managers respectively.
+            This argument is only supported if ``use_reentrant=False``.
+        determinism_check(str, optional): A string specifying the determinism
+            check to perform. By default it is set to ``"default"`` which
+            compares the shapes, dtypes, and devices of the recomputed tensors
+            against those the saved tensors. To turn off this check, specify
+            ``"none"``. Currently these are the only two supported values.
+            Please open an issue if you would like to see more determinism
+            checks. This argument is only supported if ``use_reentrant=False``,
+            if ``use_reentrant=True``, the determinism check is always disabled.
+        debug(bool, optional): If ``True``, error messages will also include
+            a trace of the operators ran during the original forward computation
+            as well as the recomputation. This argument is only supported if
+            ``use_reentrant=False``.
+        args: tuple containing inputs to the :attr:`function`
+
+    Returns:
+        Output of running :attr:`function` on :attr:`*args`
+
+    Raises:
+        ValueError: if ``use_reentrant`` is true and extra keyword arguments,
+            a custom ``context_fn`` or ``debug=True`` are passed.
+    """
+    # Missing use_reentrant: warn and fall back to the reentrant variant.
+    if use_reentrant is None:
+        warnings.warn(
+            "torch.utils.checkpoint: the use_reentrant parameter should be "
+            "passed explicitly. In version 2.4 we will raise an exception "
+            "if use_reentrant is not passed. use_reentrant=False is "
+            "recommended, but if you need to preserve the current default "
+            "behavior, you can pass use_reentrant=True. Refer to docs for more "
+            "details on the differences between the two variants.",
+            stacklevel=2,
+        )
+        use_reentrant = True
+
+    # Hack to mix *args with **kwargs in a python 2.7-compliant way
+    preserve = kwargs.pop("preserve_rng_state", True)
+    # The reentrant path forwards positional arguments only.
+    if kwargs and use_reentrant:
+        raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs))
+
+    if use_reentrant:
+        if context_fn is not noop_context_fn or debug is not False:
+            raise ValueError("Passing `context_fn` or `debug` is only supported when " "use_reentrant=False.")
+        # Offload-aware replacement for torch's CheckpointFunction.
+        return CheckpointFunctionWithOffload.apply(function, preserve, *args)
+    else:
+        gen = _checkpoint_without_reentrant_generator(
+            function, preserve, context_fn, determinism_check, debug, *args, **kwargs
+        )
+        # Runs pre-forward logic
+        next(gen)
+        ret = function(*args, **kwargs)
+        # Runs post-forward logic
+        # The result is returned from the StopIteration handler; if the
+        # generator were to yield again, this function would fall through and
+        # return None.
+        try:
+            next(gen)
+        except StopIteration:
+            return ret
+
+
+def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
+    """Flag ``model`` and every submodule for gradient checkpointing.
+
+    Each module receives three attributes: ``grad_checkpointing`` (always
+    ``True``), ``fp32_attention`` (``use_fp32_attention``) and
+    ``grad_checkpointing_step`` (``gc_step``).
+    """
+    assert isinstance(model, nn.Module)
+
+    # modules() yields the model itself and all of its descendants.
+    for submodule in model.modules():
+        submodule.grad_checkpointing = True
+        submodule.fp32_attention = use_fp32_attention
+        submodule.grad_checkpointing_step = gc_step
+
+
+def auto_grad_checkpoint(module, *args, **kwargs):
+    """Call ``module``, using activation checkpointing when it is flagged for it.
+
+    * No truthy ``grad_checkpointing`` attribute: plain ``module(*args, **kwargs)``.
+    * Flagged and iterable: ``checkpoint_sequential`` with the step stored on
+      the first element (``module[0].grad_checkpointing_step``).
+    * Flagged and not iterable: this module's reentrant ``checkpoint``.
+    """
+    if not getattr(module, "grad_checkpointing", False):
+        return module(*args, **kwargs)
+
+    if isinstance(module, Iterable):
+        segments = module[0].grad_checkpointing_step
+        return checkpoint_sequential(module, segments, *args, use_reentrant=False, **kwargs)
+
+    return checkpoint(module, *args, use_reentrant=True, **kwargs)
diff --git a/opensora/acceleration/communications.py b/opensora/acceleration/communications.py
new file mode 100644
index 0000000..d0900d2
--- /dev/null
+++ b/opensora/acceleration/communications.py
@@ -0,0 +1,188 @@
+import torch
+import torch.distributed as dist
+
+
+# ====================
+# All-To-All
+# ====================
+def _all_to_all(
+    input_: torch.Tensor,
+    world_size: int,
+    group: dist.ProcessGroup,
+    scatter_dim: int,
+    gather_dim: int,
+):
+    """Exchange slices of ``input_`` between all ranks of ``group``.
+
+    ``input_`` is cut into ``world_size`` pieces along ``scatter_dim``, the
+    pieces are exchanged with ``dist.all_to_all``, and the received pieces are
+    concatenated along ``gather_dim``. Receive buffers are shaped like the
+    first outgoing piece.
+    """
+    outgoing = [piece.contiguous() for piece in torch.tensor_split(input_, world_size, scatter_dim)]
+    incoming = [torch.empty_like(outgoing[0]) for _ in range(world_size)]
+    dist.all_to_all(incoming, outgoing, group=group)
+    merged = torch.cat(incoming, dim=gather_dim)
+    return merged.contiguous()
+
+
+class _AllToAll(torch.autograd.Function):
+    """Differentiable all-to-all communication.
+
+    The backward pass is the same exchange with the scatter and gather
+    dimensions swapped.
+
+    Args:
+        input_: input matrix
+        process_group: communication group
+        scatter_dim: scatter dimension
+        gather_dim: gather dimension
+    """
+
+    @staticmethod
+    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
+        # Remember the communication layout for the backward pass.
+        world_size = dist.get_world_size(process_group)
+        ctx.process_group = process_group
+        ctx.scatter_dim = scatter_dim
+        ctx.gather_dim = gather_dim
+        ctx.world_size = world_size
+        return _all_to_all(input_, world_size, process_group, scatter_dim, gather_dim)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Route the gradient back: scatter along gather_dim, gather along scatter_dim.
+        grad_input = _all_to_all(
+            grad_output,
+            ctx.world_size,
+            ctx.process_group,
+            ctx.gather_dim,
+            ctx.scatter_dim,
+        )
+        # Only the tensor input receives a gradient.
+        return grad_input, None, None, None
+
+
+def all_to_all(
+    input_: torch.Tensor,
+    process_group: dist.ProcessGroup,
+    scatter_dim: int = 2,
+    gather_dim: int = 1,
+):
+    """Apply the autograd-aware all-to-all exchange to ``input_``.
+
+    ``input_`` is scattered along ``scatter_dim`` and gathered along
+    ``gather_dim`` across ``process_group``.
+    """
+    exchanged = _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
+    return exchanged
+
+
+def _gather(
+    input_: torch.Tensor,
+    world_size: int,
+    group: dist.ProcessGroup,
+    gather_dim: int,
+):
+    """Collect ``input_`` from every rank of ``group`` into a list of tensors.
+
+    NOTE: a second ``_gather(input_, pg, dim)`` is defined later in this module
+    and rebinds the name at import time, so this variant is not reachable
+    through the module-level name ``_gather``.
+
+    Args:
+        input_: local tensor to contribute.
+        world_size: number of ranks in ``group``.
+        group: process group to gather over.
+        gather_dim: unused; kept so the signature stays unchanged. The
+            per-rank tensors are returned as a list, not concatenated.
+
+    Returns:
+        A list of ``world_size`` tensors, one per rank.
+    """
+    # The previous body read ``gather_list`` before assigning it (an
+    # UnboundLocalError on every call) and passed a ``gather_dim`` keyword
+    # that ``dist.gather`` does not accept. ``all_gather`` is used so that
+    # every rank receives the full list.
+    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+    dist.all_gather(gather_list, input_.contiguous(), group=group)
+    return gather_list
+
+
+# ====================
+# Gather-Split
+# ====================
+
+
+def _split(input_, pg: dist.ProcessGroup, dim=-1):
+    """Return this rank's equal share of ``input_`` along ``dim``.
+
+    With a single-rank group the input is returned untouched. Otherwise the
+    size of ``dim`` must be divisible by the group's world size.
+    """
+    world_size = dist.get_world_size(pg)
+    rank = dist.get_rank(pg)
+    # skip if only one rank involved
+    if world_size == 1:
+        return input_
+
+    # Split along ``dim`` into world_size equal chunks.
+    dim_size = input_.size(dim)
+    assert dim_size % world_size == 0, (
+        f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), "
+        f"cannot split tensor evenly"
+    )
+
+    chunks = torch.split(input_, dim_size // world_size, dim=dim)
+    return chunks[rank].contiguous()
+
+
+def _gather(input_, pg: dist.ProcessGroup, dim=-1):
+    """All-gather ``input_`` over ``pg`` and concatenate the pieces along ``dim``.
+
+    With a single-rank group the (contiguous) input is returned directly.
+    Otherwise ``input_`` must live on a CUDA device.
+    """
+    input_ = input_.contiguous()
+    world_size = dist.get_world_size(pg)
+    dist.get_rank(pg)  # result is not used
+
+    # skip if only one rank involved
+    if world_size == 1:
+        return input_
+
+    # all gather
+    pieces = [torch.empty_like(input_) for _ in range(world_size)]
+    assert input_.device.type == "cuda"
+    torch.distributed.all_gather(pieces, input_, group=pg)
+
+    # concat
+    return torch.cat(pieces, dim=dim).contiguous()
+
+
+class _GatherForwardSplitBackward(torch.autograd.Function):
+    """Gather across the process group in forward, split the gradient in backward.
+
+    Args:
+        input_: input matrix.
+        process_group: parallel mode.
+        dim: dimension
+        grad_scale: ``"up"`` multiplies and ``"down"`` divides the incoming
+            gradient by the group's world size; any other value leaves it as is.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        # NOTE(review): ``_gather`` is called here without a process group,
+        # although the definition in this module requires one - confirm
+        # whether this hook is ever exercised.
+        return _gather(input_)
+
+    @staticmethod
+    def forward(ctx, input_, process_group, dim, grad_scale):
+        # Stash what backward needs; the group is stored under ``ctx.mode``.
+        ctx.mode, ctx.dim, ctx.grad_scale = process_group, dim, grad_scale
+        return _gather(input_, process_group, dim)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        group = ctx.mode
+        scale = ctx.grad_scale
+        if scale == "up":
+            grad_output = grad_output * dist.get_world_size(group)
+        elif scale == "down":
+            grad_output = grad_output / dist.get_world_size(group)
+
+        grad_input = _split(grad_output, group, ctx.dim)
+        # Only the tensor input receives a gradient.
+        return grad_input, None, None, None
+
+
+class _SplitForwardGatherBackward(torch.autograd.Function):
+    """
+    Split the input and keep only the chunk that belongs to this rank;
+    gather the gradient across the process group in backward.
+
+    Args:
+        input_: input matrix.
+        process_group: parallel mode.
+        dim: dimension
+        grad_scale: ``"up"`` multiplies and ``"down"`` divides the incoming
+            gradient by the group's world size; any other value leaves it as is.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        # NOTE(review): ``_split`` is called here without a process group,
+        # although the definition in this module requires one - confirm
+        # whether this hook is ever exercised.
+        return _split(input_)
+
+    @staticmethod
+    def forward(ctx, input_, process_group, dim, grad_scale):
+        # Stash what backward needs; the group is stored under ``ctx.mode``.
+        ctx.mode, ctx.dim, ctx.grad_scale = process_group, dim, grad_scale
+        return _split(input_, process_group, dim)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        group = ctx.mode
+        scale = ctx.grad_scale
+        if scale == "up":
+            grad_output = grad_output * dist.get_world_size(group)
+        elif scale == "down":
+            grad_output = grad_output / dist.get_world_size(group)
+        grad_input = _gather(grad_output, group, ctx.dim)
+        # Only the tensor input receives a gradient.
+        return grad_input, None, None, None
+
+
+def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
+    """Split ``input_`` along ``dim`` in forward and gather its gradient in backward.
+
+    The gradient is rescaled only when ``grad_scale`` is ``"up"`` or
+    ``"down"``; the default ``1.0`` matches neither, so it is left unscaled.
+    """
+    shard = _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale)
+    return shard
+
+
+def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
+    """Gather ``input_`` along ``dim`` in forward and split its gradient in backward.
+
+    The gradient is rescaled only when ``grad_scale`` is ``"up"`` or
+    ``"down"``; the default ``None`` leaves it unscaled.
+    """
+    gathered = _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale)
+    return gathered
diff --git a/opensora/acceleration/parallel_states.py b/opensora/acceleration/parallel_states.py
new file mode 100644
index 0000000..ea27636
--- /dev/null
+++ b/opensora/acceleration/parallel_states.py
@@ -0,0 +1,29 @@
+import torch.distributed as dist
+
+# Registry of process groups, keyed by the kind of parallelism.
+_GLOBAL_PARALLEL_GROUPS = {}
+
+
+def set_data_parallel_group(group: dist.ProcessGroup):
+    """Register ``group`` under the ``"data"`` key."""
+    _GLOBAL_PARALLEL_GROUPS["data"] = group
+
+
+def get_data_parallel_group(get_mixed_dp_pg: bool = False):
+    """Return the data-parallel group.
+
+    With ``get_mixed_dp_pg`` set, a registered ``"mixed_dp_group"`` entry takes
+    precedence. Without a registered ``"data"`` group, ``dist.group.WORLD`` is
+    returned.
+    """
+    if get_mixed_dp_pg:
+        try:
+            return _GLOBAL_PARALLEL_GROUPS["mixed_dp_group"]
+        except KeyError:
+            pass
+    return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD)
+
+
+def set_sequence_parallel_group(group: dist.ProcessGroup):
+    """Register ``group`` under the ``"sequence"`` key."""
+    _GLOBAL_PARALLEL_GROUPS["sequence"] = group
+
+
+def get_sequence_parallel_group():
+    """Return the registered sequence-parallel group, or ``None``."""
+    return _GLOBAL_PARALLEL_GROUPS.get("sequence")
+
+
+def set_tensor_parallel_group(group: dist.ProcessGroup):
+    """Register ``group`` under the ``"tensor"`` key."""
+    _GLOBAL_PARALLEL_GROUPS["tensor"] = group
+
+
+def get_tensor_parallel_group():
+    """Return the registered tensor-parallel group, or ``None``."""
+    return _GLOBAL_PARALLEL_GROUPS.get("tensor")
diff --git a/opensora/acceleration/shardformer/__init__.py b/opensora/acceleration/shardformer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/acceleration/shardformer/modeling/__init__.py b/opensora/acceleration/shardformer/modeling/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/acceleration/shardformer/modeling/t5.py b/opensora/acceleration/shardformer/modeling/t5.py
new file mode 100644
index 0000000..9cfb808
--- /dev/null
+++ b/opensora/acceleration/shardformer/modeling/t5.py
@@ -0,0 +1,39 @@
+import torch
+import torch.nn as nn
+
+
+class T5LayerNorm(nn.Module):
+    """T5-style RMS layer norm: scales by a learned weight, with no bias and no mean subtraction."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """Create the scale parameter (initialized to ones) and store the variance epsilon."""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
+        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
+        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
+        # half-precision inputs is done in fp32
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+    @staticmethod
+    def from_native_module(module, *args, **kwargs):
+        """Build a T5LayerNorm from an apex FusedRMSNorm, copying its weight, eps and device."""
+        assert module.__class__.__name__ == "FusedRMSNorm", (
+            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm. "
+            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
+        )
+
+        layer_norm = T5LayerNorm(module.normalized_shape, eps=module.eps)
+        layer_norm.weight.data.copy_(module.weight.data)
+        layer_norm = layer_norm.to(module.weight.device)
+        return layer_norm
diff --git a/opensora/acceleration/shardformer/policy/__init__.py b/opensora/acceleration/shardformer/policy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/acceleration/shardformer/policy/t5_encoder.py b/opensora/acceleration/shardformer/policy/t5_encoder.py
new file mode 100644
index 0000000..f69d361
--- /dev/null
+++ b/opensora/acceleration/shardformer/policy/t5_encoder.py
@@ -0,0 +1,41 @@
+from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
+from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
+from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription
+
+
+class T5EncoderPolicy(Policy):
+    def config_sanity_check(self):
+        assert not self.shard_config.enable_tensor_parallelism
+        assert not self.shard_config.enable_flash_attention
+
+    def preprocess(self):
+        return self.model
+
+    def module_policy(self):
+        from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack
+
+        policy = {}
+
+        # use jit operator
+        if self.shard_config.enable_jit_fused:
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_jit_fused_T5_layer_ff_forward(),
+                    "dropout_add": get_jit_fused_dropout_add_func(),
+                },
+                policy=policy,
+                target_key=T5LayerFF,
+            )
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_T5_layer_self_attention_forward(),
+                    "dropout_add": get_jit_fused_dropout_add_func(),
+                },
+                policy=policy,
+                target_key=T5LayerSelfAttention,
+            )
+
+        return policy
+
+    def postprocess(self):
+        return self.model
diff --git a/opensora/models/__init__.py b/opensora/models/__init__.py
new file mode 100644
index 0000000..152e816
--- /dev/null
+++ b/opensora/models/__init__.py
@@ -0,0 +1,5 @@
+from .dc_ae import *
+from .hunyuan_vae import *
+from .mmdit import *
+from .text import *
+from .vae import *
diff --git a/opensora/models/dc_ae/__init__.py b/opensora/models/dc_ae/__init__.py
new file mode 100644
index 0000000..1a4513e
--- /dev/null
+++ b/opensora/models/dc_ae/__init__.py
@@ -0,0 +1 @@
+from .ae_model_zoo import DC_AE
diff --git a/opensora/models/dc_ae/ae_model_zoo.py b/opensora/models/dc_ae/ae_model_zoo.py
new file mode 100644
index 0000000..5c8e794
--- /dev/null
+++ b/opensora/models/dc_ae/ae_model_zoo.py
@@ -0,0 +1,85 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import diffusers
+import torch
+from huggingface_hub import PyTorchModelHubMixin
+from torch import nn
+
+from opensora.registry import MODELS
+from opensora.utils.ckpt import load_checkpoint
+
+from .models.dc_ae import DCAE, DCAEConfig, dc_ae_f32
+
+__all__ = ["create_dc_ae_model_cfg", "DCAE_HF", "DC_AE"]
+
+
+REGISTERED_DCAE_MODEL: dict[str, tuple[Callable, Optional[str]]] = {
+    "dc-ae-f32t4c128": (dc_ae_f32, None),
+}
+
+
+def create_dc_ae_model_cfg(name: str, pretrained_path: Optional[str] = None) -> DCAEConfig:
+    assert name in REGISTERED_DCAE_MODEL, f"{name} is not supported"
+    dc_ae_cls, default_pt_path = REGISTERED_DCAE_MODEL[name]
+    pretrained_path = default_pt_path if pretrained_path is None else pretrained_path
+    model_cfg = dc_ae_cls(name, pretrained_path)
+    return model_cfg
+
+
+class DCAE_HF(DCAE, PyTorchModelHubMixin):
+    def __init__(self, model_name: str):
+        cfg = create_dc_ae_model_cfg(model_name)
+        DCAE.__init__(self, cfg)
+
+
+@MODELS.register_module("dc_ae")
+def DC_AE(
+    model_name: str,
+    device_map: str | torch.device = "cuda",
+    torch_dtype: torch.dtype = torch.bfloat16,
+    from_scratch: bool = False,
+    from_pretrained: str | None = None,
+    is_training: bool = False,
+    use_spatial_tiling: bool = False,
+    use_temporal_tiling: bool = False,
+    spatial_tile_size: int = 256,
+    temporal_tile_size: int = 32,
+    tile_overlap_factor: float = 0.25,
+    scaling_factor: float | None = None,
+    disc_off_grad_ckpt: bool = False,
+) -> DCAE_HF:
+    """Build a DC-AE (hub weights unless from_scratch), optionally load a ckpt, then set runtime options."""
+    build = DCAE_HF if from_scratch else DCAE_HF.from_pretrained
+    model = build(model_name).to(device_map, torch_dtype)
+    if from_pretrained is not None:
+        model = load_checkpoint(model, from_pretrained, device_map=device_map)
+        print(f"loaded dc_ae from ckpt path: {from_pretrained}")
+    model.cfg.is_training = is_training
+    model.use_spatial_tiling = use_spatial_tiling
+    model.use_temporal_tiling = use_temporal_tiling
+    model.spatial_tile_size = spatial_tile_size
+    model.temporal_tile_size = temporal_tile_size
+    # Keep the latent-space tile sizes in sync with the overridden pixel-space tile sizes.
+    model.spatial_tile_latent_size = spatial_tile_size // model.spatial_compression_ratio
+    model.temporal_tile_latent_size = temporal_tile_size // model.time_compression_ratio
+    model.tile_overlap_factor = tile_overlap_factor
+    if scaling_factor is not None:
+        model.scaling_factor = scaling_factor
+    model.decoder.disc_off_grad_ckpt = disc_off_grad_ckpt
+    return model
\ No newline at end of file
diff --git a/opensora/models/dc_ae/models/__init__.py b/opensora/models/dc_ae/models/__init__.py
new file mode 100644
index 0000000..ce6455c
--- /dev/null
+++ b/opensora/models/dc_ae/models/__init__.py
@@ -0,0 +1 @@
+from .dc_ae import *
diff --git a/opensora/models/dc_ae/models/dc_ae.py b/opensora/models/dc_ae/models/dc_ae.py
new file mode 100644
index 0000000..c1bd81d
--- /dev/null
+++ b/opensora/models/dc_ae/models/dc_ae.py
@@ -0,0 +1,815 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+from omegaconf import MISSING, OmegaConf
+from torch import Tensor
+
+from opensora.acceleration.checkpoint import auto_grad_checkpoint
+
+from ..utils import init_modules
+from .nn.act import build_act
+from .nn.norm import build_norm
+from .nn.ops import (
+    ChannelDuplicatingPixelShuffleUpSampleLayer,
+    ConvLayer,
+    ConvPixelShuffleUpSampleLayer,
+    ConvPixelUnshuffleDownSampleLayer,
+    EfficientViTBlock,
+    IdentityLayer,
+    InterpolateConvUpSampleLayer,
+    OpSequential,
+    PixelUnshuffleChannelAveragingDownSampleLayer,
+    ResBlock,
+    ResidualBlock,
+)
+
+__all__ = ["DCAE", "dc_ae_f32"]
+
+
+@dataclass
+class EncoderConfig:
+    in_channels: int = MISSING
+    latent_channels: int = MISSING
+    width_list: tuple[int, ...] = (128, 256, 512, 512, 1024, 1024)
+    depth_list: tuple[int, ...] = (2, 2, 2, 2, 2, 2)
+    block_type: Any = "ResBlock"
+    norm: str = "rms2d"
+    act: str = "silu"
+    downsample_block_type: str = "ConvPixelUnshuffle"
+    downsample_match_channel: bool = True
+    downsample_shortcut: Optional[str] = "averaging"
+    out_norm: Optional[str] = None
+    out_act: Optional[str] = None
+    out_shortcut: Optional[str] = "averaging"
+    double_latent: bool = False
+    is_video: bool = False
+    temporal_downsample: tuple[bool, ...] = ()
+
+
+@dataclass
+class DecoderConfig:
+    in_channels: int = MISSING
+    latent_channels: int = MISSING
+    in_shortcut: Optional[str] = "duplicating"
+    width_list: tuple[int, ...] = (128, 256, 512, 512, 1024, 1024)
+    depth_list: tuple[int, ...] = (2, 2, 2, 2, 2, 2)
+    block_type: Any = "ResBlock"
+    norm: Any = "rms2d"
+    act: Any = "silu"
+    upsample_block_type: str = "ConvPixelShuffle"
+    upsample_match_channel: bool = True
+    upsample_shortcut: str = "duplicating"
+    out_norm: str = "rms2d"
+    out_act: str = "relu"
+    is_video: bool = False
+    temporal_upsample: tuple[bool, ...] = ()
+
+
+@dataclass
+class DCAEConfig:
+    in_channels: int = 3
+    latent_channels: int = 32
+    time_compression_ratio: int = 1
+    spatial_compression_ratio: int = 32
+    encoder: EncoderConfig = field(
+        default_factory=lambda: EncoderConfig(in_channels="${..in_channels}", latent_channels="${..latent_channels}")
+    )
+    decoder: DecoderConfig = field(
+        default_factory=lambda: DecoderConfig(in_channels="${..in_channels}", latent_channels="${..latent_channels}")
+    )
+    use_quant_conv: bool = False
+
+    pretrained_path: Optional[str] = None
+    pretrained_source: str = "dc-ae"
+
+    scaling_factor: Optional[float] = None
+    is_image_model: bool = False
+
+    is_training: bool = False  # NOTE: set to True in vae train config
+
+    use_spatial_tiling: bool = False
+    use_temporal_tiling: bool = False
+    spatial_tile_size: int = 256
+    temporal_tile_size: int = 32
+    tile_overlap_factor: float = 0.25
+    
+
+
+def build_block(
+    block_type: str, in_channels: int, out_channels: int, norm: Optional[str], act: Optional[str], is_video: bool
+) -> nn.Module:
+    if block_type == "ResBlock":
+        assert in_channels == out_channels
+        main_block = ResBlock(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=1,
+            use_bias=(True, False),
+            norm=(None, norm),
+            act_func=(act, None),
+            is_video=is_video,
+        )
+        block = ResidualBlock(main_block, IdentityLayer())
+    elif block_type == "EViT_GLU":
+        assert in_channels == out_channels
+        block = EfficientViTBlock(
+            in_channels, norm=norm, act_func=act, local_module="GLUMBConv", scales=(), is_video=is_video
+        )
+    elif block_type == "EViTS5_GLU":
+        assert in_channels == out_channels
+        block = EfficientViTBlock(
+            in_channels, norm=norm, act_func=act, local_module="GLUMBConv", scales=(5,), is_video=is_video
+        )
+    else:
+        raise ValueError(f"block_type {block_type} is not supported")
+    return block
+
+
+def build_stage_main(
+    width: int, depth: int, block_type: str | list[str], norm: str, act: str, input_width: int, is_video: bool
+) -> list[nn.Module]:
+    assert isinstance(block_type, str) or (isinstance(block_type, list) and depth == len(block_type))
+    stage = []
+    for d in range(depth):
+        current_block_type = block_type[d] if isinstance(block_type, list) else block_type
+        block = build_block(
+            block_type=current_block_type,
+            in_channels=width if d > 0 else input_width,
+            out_channels=width,
+            norm=norm,
+            act=act,
+            is_video=is_video,
+        )
+        stage.append(block)
+    return stage
+
+
+def build_downsample_block(
+    block_type: str,
+    in_channels: int,
+    out_channels: int,
+    shortcut: Optional[str],
+    is_video: bool,
+    temporal_downsample: bool = False,
+) -> nn.Module:
+    """
+    Spatial downsample is always performed. Temporal downsample is optional.
+    """
+
+    if block_type == "Conv":
+        if is_video:
+            if temporal_downsample:
+                stride = (2, 2, 2)
+            else:
+                stride = (1, 2, 2)
+        else:
+            stride = 2
+        block = ConvLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            use_bias=True,
+            norm=None,
+            act_func=None,
+            is_video=is_video,
+        )
+    elif block_type == "ConvPixelUnshuffle":
+        if is_video:
+            raise NotImplementedError("ConvPixelUnshuffle downsample is not supported for video")
+        block = ConvPixelUnshuffleDownSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=3, factor=2
+        )
+    else:
+        raise ValueError(f"block_type {block_type} is not supported for downsampling")
+    if shortcut is None:
+        pass
+    elif shortcut == "averaging":
+        shortcut_block = PixelUnshuffleChannelAveragingDownSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, factor=2, temporal_downsample=temporal_downsample
+        )
+        block = ResidualBlock(block, shortcut_block)
+    else:
+        raise ValueError(f"shortcut {shortcut} is not supported for downsample")
+    return block
+
+
+def build_upsample_block(
+    block_type: str,
+    in_channels: int,
+    out_channels: int,
+    shortcut: Optional[str],
+    is_video: bool,
+    temporal_upsample: bool = False,
+) -> nn.Module:
+    if block_type == "ConvPixelShuffle":
+        if is_video:
+            raise NotImplementedError("ConvPixelShuffle upsample is not supported for video")
+        block = ConvPixelShuffleUpSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=3, factor=2
+        )
+    elif block_type == "InterpolateConv":
+        block = InterpolateConvUpSampleLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            factor=2,
+            is_video=is_video,
+            temporal_upsample=temporal_upsample,
+        )
+    else:
+        raise ValueError(f"block_type {block_type} is not supported for upsampling")
+    if shortcut is None:
+        pass
+    elif shortcut == "duplicating":
+        shortcut_block = ChannelDuplicatingPixelShuffleUpSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, factor=2, temporal_upsample=temporal_upsample
+        )
+        block = ResidualBlock(block, shortcut_block)
+    else:
+        raise ValueError(f"shortcut {shortcut} is not supported for upsample")
+    return block
+
+
+def build_encoder_project_in_block(
+    in_channels: int, out_channels: int, factor: int, downsample_block_type: str, is_video: bool
+):
+    """Build the encoder input projection.
+
+    ``factor == 1`` gives a stride-1 3x3 ``ConvLayer``; ``factor == 2`` gives a shortcut-free downsample
+    block (image models only). Any other factor raises ``ValueError``.
+    """
+    if factor == 1:
+        block = ConvLayer(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1,
+            use_bias=True, norm=None, act_func=None, is_video=is_video,
+        )
+    elif factor == 2:
+        if is_video:
+            raise NotImplementedError("Downsample during project_in is not supported for video")
+        # is_video is a required argument of build_downsample_block; omitting it raised TypeError here.
+        block = build_downsample_block(
+            downsample_block_type, in_channels=in_channels, out_channels=out_channels, shortcut=None, is_video=is_video
+        )
+    else:
+        raise ValueError(f"downsample factor {factor} is not supported for encoder project in")
+    return block
+
+
+def build_encoder_project_out_block(
+    in_channels: int,
+    out_channels: int,
+    norm: Optional[str],
+    act: Optional[str],
+    shortcut: Optional[str],
+    is_video: bool,
+):
+    """Build the encoder output projection: optional norm and activation, then a stride-1 3x3 conv.
+
+    With ``shortcut == "averaging"`` the projection is wrapped in a ``ResidualBlock`` whose shortcut is a
+    factor-1 channel-averaging layer; ``None`` disables the shortcut and other values raise ``ValueError``.
+    """
+    block = OpSequential(
+        [
+            # Pass the channel count, as build_decoder_project_out_block does, so a non-None norm can be sized.
+            build_norm(norm, in_channels),
+            build_act(act),
+            ConvLayer(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1,
+                use_bias=True, norm=None, act_func=None, is_video=is_video,
+            ),
+        ]
+    )
+    if shortcut is None:
+        pass
+    elif shortcut == "averaging":
+        shortcut_block = PixelUnshuffleChannelAveragingDownSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, factor=1
+        )
+        block = ResidualBlock(block, shortcut_block)
+    else:
+        raise ValueError(f"shortcut {shortcut} is not supported for encoder project out")
+    return block
+
+
+def build_decoder_project_in_block(in_channels: int, out_channels: int, shortcut: Optional[str], is_video: bool):
+    block = ConvLayer(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=3,
+        stride=1,
+        use_bias=True,
+        norm=None,
+        act_func=None,
+        is_video=is_video,
+    )
+    if shortcut is None:
+        pass
+    elif shortcut == "duplicating":
+        shortcut_block = ChannelDuplicatingPixelShuffleUpSampleLayer(
+            in_channels=in_channels, out_channels=out_channels, factor=1
+        )
+        block = ResidualBlock(block, shortcut_block)
+    else:
+        raise ValueError(f"shortcut {shortcut} is not supported for decoder project in")
+    return block
+
+
+def build_decoder_project_out_block(
+    in_channels: int,
+    out_channels: int,
+    factor: int,
+    upsample_block_type: str,
+    norm: Optional[str],
+    act: Optional[str],
+    is_video: bool,
+):
+    """Build the decoder output projection: norm, activation, then a conv (factor 1) or upsample (factor 2).
+
+    Upsampling at the output is only implemented for image models; other factors raise ``ValueError``.
+    """
+    layers: list[nn.Module] = [
+        build_norm(norm, in_channels),
+        build_act(act),
+    ]
+    if factor == 1:
+        layers.append(
+            ConvLayer(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1,
+                use_bias=True, norm=None, act_func=None, is_video=is_video,
+            )
+        )
+    elif factor == 2:
+        if is_video:
+            raise NotImplementedError("Upsample during project_out is not supported for video")
+        # is_video is a required argument of build_upsample_block; omitting it raised TypeError here.
+        layers.append(
+            build_upsample_block(
+                block_type=upsample_block_type, in_channels=in_channels, out_channels=out_channels,
+                shortcut=None, is_video=is_video,
+            )
+        )
+    else:
+        raise ValueError(f"upsample factor {factor} is not supported for decoder project out")
+    return OpSequential(layers)
+
+
+class Encoder(nn.Module):
+    """DC-AE encoder: input projection, per-stage blocks with downsampling between stages, output projection."""
+
+    def __init__(self, cfg: EncoderConfig):
+        super().__init__()
+        self.cfg = cfg
+        num_stages = len(cfg.width_list)
+        self.num_stages = num_stages
+        assert len(cfg.depth_list) == num_stages
+        assert isinstance(cfg.block_type, str) or (
+            isinstance(cfg.block_type, list) and len(cfg.block_type) == num_stages
+        )
+
+        self.project_in = build_encoder_project_in_block(
+            in_channels=cfg.in_channels,
+            out_channels=cfg.width_list[0] if cfg.depth_list[0] > 0 else cfg.width_list[1],
+            factor=1 if cfg.depth_list[0] > 0 else 2,
+            downsample_block_type=cfg.downsample_block_type,
+            is_video=cfg.is_video,
+        )
+
+        self.stages: list[OpSequential] = []
+        for stage_id, (width, depth) in enumerate(zip(cfg.width_list, cfg.depth_list)):
+            block_type = cfg.block_type[stage_id] if isinstance(cfg.block_type, list) else cfg.block_type
+            stage = build_stage_main(
+                width=width,
+                depth=depth,
+                block_type=block_type,
+                norm=cfg.norm,
+                act=cfg.act,
+                input_width=width,
+                is_video=cfg.is_video,
+            )
+
+            if stage_id < num_stages - 1 and depth > 0:
+                downsample_block = build_downsample_block(
+                    block_type=cfg.downsample_block_type,
+                    in_channels=width,
+                    out_channels=cfg.width_list[stage_id + 1] if cfg.downsample_match_channel else width,
+                    shortcut=cfg.downsample_shortcut,
+                    is_video=cfg.is_video,
+                    temporal_downsample=cfg.temporal_downsample[stage_id] if len(cfg.temporal_downsample) > 0 else False,
+                )
+                stage.append(downsample_block)
+            self.stages.append(OpSequential(stage))
+        self.stages = nn.ModuleList(self.stages)
+
+        self.project_out = build_encoder_project_out_block(
+            in_channels=cfg.width_list[-1],
+            out_channels=2 * cfg.latent_channels if cfg.double_latent else cfg.latent_channels,
+            norm=cfg.out_norm,
+            act=cfg.out_act,
+            shortcut=cfg.out_shortcut,
+            is_video=cfg.is_video,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run project_in directly, then non-empty stages and project_out via auto_grad_checkpoint."""
+        x = self.project_in(x)
+        for stage in self.stages:
+            if len(stage.op_list) == 0:
+                continue
+            x = auto_grad_checkpoint(stage, x)
+        x = auto_grad_checkpoint(self.project_out, x)
+        return x
+
+
+class Decoder(nn.Module):
+    """DC-AE decoder: input projection, stages run from last to first (upsample, then blocks), output projection."""
+
+    def __init__(self, cfg: DecoderConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.disc_off_grad_ckpt = False  # default; DC_AE() may override. Prevents AttributeError in forward().
+        num_stages = len(cfg.width_list)
+        self.num_stages = num_stages
+        assert len(cfg.depth_list) == num_stages
+        assert isinstance(cfg.block_type, str) or (
+            isinstance(cfg.block_type, list) and len(cfg.block_type) == num_stages
+        )
+        assert isinstance(cfg.norm, str) or (isinstance(cfg.norm, list) and len(cfg.norm) == num_stages)
+        assert isinstance(cfg.act, str) or (isinstance(cfg.act, list) and len(cfg.act) == num_stages)
+
+        self.project_in = build_decoder_project_in_block(
+            in_channels=cfg.latent_channels,
+            out_channels=cfg.width_list[-1],
+            shortcut=cfg.in_shortcut,
+            is_video=cfg.is_video,
+        )
+
+        self.stages: list[OpSequential] = []
+        for stage_id, (width, depth) in reversed(list(enumerate(zip(cfg.width_list, cfg.depth_list)))):
+            stage = []
+            if stage_id < num_stages - 1 and depth > 0:
+                upsample_block = build_upsample_block(
+                    block_type=cfg.upsample_block_type,
+                    in_channels=cfg.width_list[stage_id + 1],
+                    out_channels=width if cfg.upsample_match_channel else cfg.width_list[stage_id + 1],
+                    shortcut=cfg.upsample_shortcut,
+                    is_video=cfg.is_video,
+                    temporal_upsample=cfg.temporal_upsample[stage_id] if len(cfg.temporal_upsample) > 0 else False,
+                )
+                stage.append(upsample_block)
+
+            block_type = cfg.block_type[stage_id] if isinstance(cfg.block_type, list) else cfg.block_type
+            norm = cfg.norm[stage_id] if isinstance(cfg.norm, list) else cfg.norm
+            act = cfg.act[stage_id] if isinstance(cfg.act, list) else cfg.act
+            stage.extend(
+                build_stage_main(
+                    width=width,
+                    depth=depth,
+                    block_type=block_type,
+                    norm=norm,
+                    act=act,
+                    input_width=(
+                        width if cfg.upsample_match_channel else cfg.width_list[min(stage_id + 1, num_stages - 1)]
+                    ),
+                    is_video=cfg.is_video,
+                )
+            )
+            self.stages.insert(0, OpSequential(stage))
+        self.stages = nn.ModuleList(self.stages)
+
+        self.project_out = build_decoder_project_out_block(
+            in_channels=cfg.width_list[0] if cfg.depth_list[0] > 0 else cfg.width_list[1],
+            out_channels=cfg.in_channels,
+            factor=1 if cfg.depth_list[0] > 0 else 2,
+            upsample_block_type=cfg.upsample_block_type,
+            norm=cfg.out_norm,
+            act=cfg.out_act,
+            is_video=cfg.is_video,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = auto_grad_checkpoint(self.project_in, x)
+        for stage in reversed(self.stages):
+            if len(stage.op_list) == 0:
+                continue
+            x = auto_grad_checkpoint(stage, x)
+        if self.disc_off_grad_ckpt:
+            x = self.project_out(x)
+        else:
+            x = auto_grad_checkpoint(self.project_out, x)
+        return x
+
+
+class DCAE(nn.Module):
+    def __init__(self, cfg: DCAEConfig):
+        """Build encoder/decoder from cfg, derive latent tile sizes, init weights, then load pretrained weights."""
+        super().__init__()
+        self.cfg = cfg
+        self.encoder = Encoder(cfg.encoder)
+        self.decoder = Decoder(cfg.decoder)
+        self.scaling_factor = cfg.scaling_factor
+        self.time_compression_ratio = cfg.time_compression_ratio
+        self.spatial_compression_ratio = cfg.spatial_compression_ratio
+        self.use_spatial_tiling = cfg.use_spatial_tiling
+        self.use_temporal_tiling = cfg.use_temporal_tiling
+        self.spatial_tile_size = cfg.spatial_tile_size
+        self.temporal_tile_size = cfg.temporal_tile_size
+        assert (
+            cfg.spatial_tile_size % cfg.spatial_compression_ratio == 0
+        ), f"spatial tile size {cfg.spatial_tile_size} must be divisible by spatial compression of {cfg.spatial_compression_ratio}"
+        self.spatial_tile_latent_size = cfg.spatial_tile_size // cfg.spatial_compression_ratio
+        assert (
+            cfg.temporal_tile_size % cfg.time_compression_ratio == 0
+        ), f"temporal tile size {cfg.temporal_tile_size} must be divisible by temporal compression of {cfg.time_compression_ratio}"
+        self.temporal_tile_latent_size = cfg.temporal_tile_size // cfg.time_compression_ratio
+        self.tile_overlap_factor = cfg.tile_overlap_factor
+        self.to(torch.float32)
+        init_modules(self, init_type="trunc_normal")  # run before load_model() so init cannot overwrite loaded weights
+        if self.cfg.pretrained_path is not None:
+            self.load_model()
+
+    def load_model(self):
+        if self.cfg.pretrained_source == "dc-ae":
+            state_dict = torch.load(self.cfg.pretrained_path, map_location="cpu", weights_only=True)["state_dict"]
+            self.load_state_dict(state_dict)
+        else:
+            raise NotImplementedError
+
+    def get_last_layer(self):
+        return self.decoder.project_out.op_list[2].conv.weight
+
+    # @property
+    # def spatial_compression_ratio(self) -> int:
+    #     return 2 ** (self.decoder.num_stages - 1)
+
+    def encode_single(self, x: torch.Tensor, is_video_encoder: bool = False) -> torch.Tensor:
+        assert x.shape[0] == 1
+        is_video = x.dim() == 5
+        if is_video and not is_video_encoder:
+            b, c, f, h, w = x.shape
+            x = x.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
+        z = self.encoder(x)
+
+        if is_video and not is_video_encoder:
+            z = z.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
+
+        if self.scaling_factor is not None:
+            z = z / self.scaling_factor
+
+        return z
+
+    def _encode(self, x: torch.Tensor) -> torch.Tensor:
+        if self.cfg.is_training:
+            return self.encoder(x)
+        is_video_encoder = self.encoder.cfg.is_video if self.encoder.cfg.is_video is not None else False
+        x_ret = []
+        for i in range(x.shape[0]):
+            x_ret.append(self.encode_single(x[i : i + 1], is_video_encoder))
+        return torch.cat(x_ret, dim=0)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
+        return b
+
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
+                x / blend_extent
+            )
+        return b
+
+    def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        net_size = int(self.spatial_tile_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.spatial_tile_latent_size * self.tile_overlap_factor)
+        row_limit = self.spatial_tile_latent_size - blend_extent
+
+        # Split video into tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[-2], net_size):
+            row = []
+            for j in range(0, x.shape[-1], net_size):
+                tile = x[:, :, :, i : i + self.spatial_tile_size, j : j + self.spatial_tile_size]
+                tile = self._encode(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        return torch.cat(result_rows, dim=-2)
+
+    def temporal_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+        overlap_size = int(self.temporal_tile_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.temporal_tile_latent_size * self.tile_overlap_factor)
+        t_limit = self.temporal_tile_latent_size - blend_extent
+
+        # Split the video into tiles and encode them separately.
+        row = []
+        for i in range(0, x.shape[2], overlap_size):
+            tile = x[:, :, i : i + self.temporal_tile_size, :, :]
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.spatial_tile_size or tile.shape[-2] > self.spatial_tile_size
+            ):
+                tile = self.spatial_tiled_encode(tile)
+            else:
+                tile = self._encode(tile)
+            row.append(tile)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)
+            result_row.append(tile[:, :, :t_limit, :, :])
+
+        return torch.cat(result_row, dim=2)
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_temporal_tiling and x.shape[2] > self.temporal_tile_size:
+            return self.temporal_tiled_encode(x)
+        elif self.use_spatial_tiling and (x.shape[-1] > self.spatial_tile_size or x.shape[-2] > self.spatial_tile_size):
+            return self.spatial_tiled_encode(x)
+        else:
+            return self._encode(x)
+
+    def spatial_tiled_decode(self, z: torch.FloatTensor) -> torch.Tensor:
+        net_size = int(self.spatial_tile_latent_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.spatial_tile_size * self.tile_overlap_factor)
+        row_limit = self.spatial_tile_size - blend_extent
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[-2], net_size):
+            row = []
+            for j in range(0, z.shape[-1], net_size):
+                tile = z[:, :, :, i : i + self.spatial_tile_latent_size, j : j + self.spatial_tile_latent_size]
+                decoded = self._decode(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        return torch.cat(result_rows, dim=-2)
+
+    def temporal_tiled_decode(self, z: torch.Tensor) -> torch.Tensor:
+        overlap_size = int(self.temporal_tile_latent_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.temporal_tile_size * self.tile_overlap_factor)
+        t_limit = self.temporal_tile_size - blend_extent
+
+        row = []
+        for i in range(0, z.shape[2], overlap_size):
+            tile = z[:, :, i : i + self.temporal_tile_latent_size, :, :]
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.spatial_tile_latent_size or tile.shape[-2] > self.spatial_tile_latent_size
+            ):
+                decoded = self.spatial_tiled_decode(tile)
+            else:
+                decoded = self._decode(tile)
+            row.append(decoded)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)
+            result_row.append(tile[:, :, :t_limit, :, :])
+
+        return torch.cat(result_row, dim=2)
+
+    def decode_single(self, z: torch.Tensor, is_video_decoder: bool = False) -> torch.Tensor:
+        assert z.shape[0] == 1
+        is_video = z.dim() == 5
+        if is_video and not is_video_decoder:
+            b, c, f, h, w = z.shape
+            z = z.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
+        if self.scaling_factor is not None:
+            z = z * self.scaling_factor
+
+        x = self.decoder(z)
+
+        if is_video and not is_video_decoder:
+            x = x.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
+        return x
+
+    def _decode(self, z: torch.Tensor) -> torch.Tensor:
+        if self.cfg.is_training:
+            return self.decoder(z)
+        is_video_decoder = self.decoder.cfg.is_video if self.decoder.cfg.is_video is not None else False
+        x_ret = []
+        for i in range(z.shape[0]):
+            x_ret.append(self.decode_single(z[i : i + 1], is_video_decoder))
+        return torch.cat(x_ret, dim=0)
+
+    def decode(self, z: torch.Tensor) -> torch.Tensor:
+        if self.use_temporal_tiling and z.shape[2] > self.temporal_tile_latent_size:
+            return self.temporal_tiled_decode(z)
+        elif self.use_spatial_tiling and (
+            z.shape[-1] > self.spatial_tile_latent_size or z.shape[-2] > self.spatial_tile_latent_size
+        ):
+            return self.spatial_tiled_decode(z)
+        else:
+            return self._decode(z)
+
+    def forward(self, x: torch.Tensor) -> tuple[Any, Tensor, dict[Any, Any]]:
+        x_type = x.dtype
+        is_image_model = self.cfg.__dict__.get("is_image_model", False)
+        x = x.to(self.encoder.project_in.conv.weight.dtype)
+
+        if is_image_model:
+            b, c, _, h, w = x.shape
+            x = x.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
+
+        z = self.encode(x)
+        dec = self.decode(z)
+
+        if is_image_model:
+            dec = dec.reshape(b, 1, c, h, w).permute(0, 2, 1, 3, 4)
+            z = z.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
+
+        dec = dec.to(x_type)
+        return dec, None, z
+
+    def get_latent_size(self, input_size: list[int]) -> list[int]:
+        latent_size = []
+        # T
+        latent_size.append((input_size[0] - 1) // self.time_compression_ratio + 1)
+        # H, w
+        for i in range(1, 3):
+            latent_size.append((input_size[i] - 1) // self.spatial_compression_ratio + 1)
+        return latent_size
+
+
+def dc_ae_f32(name: str, pretrained_path: str) -> DCAEConfig:
+    if name in ["dc-ae-f32t4c128"]:
+        cfg_str = (
+            "time_compression_ratio=4 "
+            "spatial_compression_ratio=32 "
+            "encoder.block_type=[ResBlock,ResBlock,ResBlock,EViTS5_GLU,EViTS5_GLU,EViTS5_GLU] "
+            "encoder.width_list=[128,256,512,512,1024,1024] encoder.depth_list=[2,2,2,3,3,3] "
+            "encoder.downsample_block_type=Conv "
+            "encoder.norm=rms3d "
+            "encoder.is_video=True "
+            "decoder.block_type=[ResBlock,ResBlock,ResBlock,EViTS5_GLU,EViTS5_GLU,EViTS5_GLU] "
+            "decoder.width_list=[128,256,512,512,1024,1024] decoder.depth_list=[3,3,3,3,3,3] "
+            "decoder.upsample_block_type=InterpolateConv "
+            "decoder.norm=rms3d decoder.act=silu decoder.out_norm=rms3d "
+            "decoder.is_video=True "
+            "encoder.temporal_downsample=[False,False,False,True,True,False] "
+            "decoder.temporal_upsample=[False,False,False,True,True,False] "
+            "latent_channels=128"
+        )  # make sure there is no trailing blankspace in the last line
+    else:
+        raise NotImplementedError
+    cfg = OmegaConf.from_dotlist(cfg_str.split(" "))
+    cfg: DCAEConfig = OmegaConf.to_object(OmegaConf.merge(OmegaConf.structured(DCAEConfig), cfg))
+    cfg.pretrained_path = pretrained_path
+    return cfg
+
diff --git a/opensora/models/dc_ae/models/nn/__init__.py b/opensora/models/dc_ae/models/nn/__init__.py
new file mode 100644
index 0000000..96c2e7b
--- /dev/null
+++ b/opensora/models/dc_ae/models/nn/__init__.py
@@ -0,0 +1,3 @@
+from .act import *
+from .norm import *
+from .ops import *
diff --git a/opensora/models/dc_ae/models/nn/act.py b/opensora/models/dc_ae/models/nn/act.py
new file mode 100644
index 0000000..c62db95
--- /dev/null
+++ b/opensora/models/dc_ae/models/nn/act.py
@@ -0,0 +1,44 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import partial
+from typing import Optional
+
+import torch.nn as nn
+
+from ..nn.vo_ops import build_kwargs_from_config
+
+
+__all__ = ["build_act"]
+
+
+# Registry mapping an activation name to the callable that constructs it;
+# ``build_act`` looks names up here. Register new activation functions here.
+# NOTE: the "gelu" entry is a ``functools.partial`` that presets
+# ``approximate="tanh"``, not a bare class.
+REGISTERED_ACT_DICT: dict[str, type] = {
+    "relu": nn.ReLU,
+    "relu6": nn.ReLU6,
+    "hswish": nn.Hardswish,
+    "silu": nn.SiLU,
+    "gelu": partial(nn.GELU, approximate="tanh"),
+}
+
+
+def build_act(name: str, **kwargs) -> Optional[nn.Module]:
+    if name in REGISTERED_ACT_DICT:
+        act_cls = REGISTERED_ACT_DICT[name]
+        args = build_kwargs_from_config(kwargs, act_cls)
+        return act_cls(**args)
+    else:
+        return None
diff --git a/opensora/models/dc_ae/models/nn/norm.py b/opensora/models/dc_ae/models/nn/norm.py
new file mode 100644
index 0000000..5713492
--- /dev/null
+++ b/opensora/models/dc_ae/models/nn/norm.py
@@ -0,0 +1,98 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..nn.vo_ops import build_kwargs_from_config
+
+__all__ = ["LayerNorm2d", "build_norm", "set_norm_eps"]
+
+
+class LayerNorm2d(nn.LayerNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = x - torch.mean(x, dim=1, keepdim=True)
+        out = out / torch.sqrt(torch.square(out).mean(dim=1, keepdim=True) + self.eps)
+        if self.elementwise_affine:
+            out = out * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1)
+        return out
+
+
+
+class RMSNorm2d(nn.Module):
+    def __init__(
+        self, num_features: int, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True
+    ) -> None:
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = torch.nn.parameter.Parameter(torch.empty(self.num_features))
+            if bias:
+                self.bias = torch.nn.parameter.Parameter(torch.empty(self.num_features))
+            else:
+                self.register_parameter("bias", None)
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = (x / torch.sqrt(torch.square(x.float()).mean(dim=1, keepdim=True) + self.eps)).to(x.dtype)
+        if self.elementwise_affine:
+            x = x * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1)
+        return x
+
+
+class RMSNorm3d(RMSNorm2d):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = (x / torch.sqrt(torch.square(x.float()).mean(dim=1, keepdim=True) + self.eps)).to(x.dtype)
+        if self.elementwise_affine:
+            x = x * self.weight.view(1, -1, 1, 1, 1) + self.bias.view(1, -1, 1, 1, 1)
+        return x
+
+
+# Registry mapping a normalisation name to the class that implements it;
+# ``build_norm`` looks names up here. Register new normalization layers here.
+REGISTERED_NORM_DICT: dict[str, type] = {
+    "bn2d": nn.BatchNorm2d,
+    "ln": nn.LayerNorm,
+    "ln2d": LayerNorm2d,
+    "rms2d": RMSNorm2d,
+    "rms3d": RMSNorm3d,
+}
+
+
+def build_norm(name="bn2d", num_features=None, **kwargs) -> Optional[nn.Module]:
+    if name in ["ln", "ln2d"]:
+        kwargs["normalized_shape"] = num_features
+    else:
+        kwargs["num_features"] = num_features
+    if name in REGISTERED_NORM_DICT:
+        norm_cls = REGISTERED_NORM_DICT[name]
+        args = build_kwargs_from_config(kwargs, norm_cls)
+        return norm_cls(**args)
+    else:
+        return None
+
+
+def set_norm_eps(model: nn.Module, eps: Optional[float] = None) -> None:
+    for m in model.modules():
+        if isinstance(m, (nn.GroupNorm, nn.LayerNorm, _BatchNorm)):
+            if eps is not None:
+                m.eps = eps
diff --git a/opensora/models/dc_ae/models/nn/ops.py b/opensora/models/dc_ae/models/nn/ops.py
new file mode 100644
index 0000000..05133c9
--- /dev/null
+++ b/opensora/models/dc_ae/models/nn/ops.py
@@ -0,0 +1,978 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from opensora.models.vae.utils import ChannelChunkConv3d
+
+from ...models.nn.act import build_act
+from ...models.nn.norm import build_norm
+from ...models.nn.vo_ops import chunked_interpolate, get_same_padding, pixel_shuffle_3d, pixel_unshuffle_3d, resize
+from ...utils import list_sum, val2list, val2tuple
+
+__all__ = [
+    "ConvLayer",
+    "UpSampleLayer",
+    "ConvPixelUnshuffleDownSampleLayer",
+    "PixelUnshuffleChannelAveragingDownSampleLayer",
+    "ConvPixelShuffleUpSampleLayer",
+    "ChannelDuplicatingPixelShuffleUpSampleLayer",
+    "LinearLayer",
+    "IdentityLayer",
+    "DSConv",
+    "MBConv",
+    "FusedMBConv",
+    "ResBlock",
+    "LiteMLA",
+    "EfficientViTBlock",
+    "ResidualBlock",
+    "DAGBlock",
+    "OpSequential",
+]
+
+
+#################################################################################
+#                             Basic Layers                                      #
+#################################################################################
+
+
+class ConvLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,
+        dilation=1,
+        groups=1,
+        use_bias=False,
+        dropout=0,
+        norm="bn2d",
+        act_func="relu",
+        is_video=False,
+        pad_mode_3d="constant",
+    ):
+        super().__init__()
+        self.is_video = is_video
+
+        if self.is_video:
+            assert dilation == 1, "only support dilation=1 for 3d conv"
+            assert kernel_size % 2 == 1, "only support odd kernel size for 3d conv"
+            self.pad_mode_3d = pad_mode_3d  # 3d padding follows CausalConv3d by Hunyuan
+            # padding = (
+            #     kernel_size // 2,
+            #     kernel_size // 2,
+            #     kernel_size // 2,
+            #     kernel_size // 2,
+            #     kernel_size - 1,
+            #     0,
+            # )  # W, H, T
+            # non-causal padding
+            padding = (
+                kernel_size // 2,
+                kernel_size // 2,
+                kernel_size // 2,
+                kernel_size // 2,
+                kernel_size // 2,
+                kernel_size // 2,
+            )
+            self.padding = padding
+            self.dropout = nn.Dropout3d(dropout, inplace=False) if dropout > 0 else None
+            assert isinstance(stride, (int, tuple)), "stride must be an integer or 3-tuple for 3d conv"
+            self.conv = ChannelChunkConv3d(  # padding is handled by F.pad() in forward()
+                in_channels,
+                out_channels,
+                kernel_size=(kernel_size, kernel_size, kernel_size),
+                stride=(stride, stride, stride) if isinstance(stride, int) else stride,
+                groups=groups,
+                bias=use_bias,
+            )
+        else:
+            padding = get_same_padding(kernel_size)
+            padding *= dilation
+            self.dropout = nn.Dropout2d(dropout, inplace=False) if dropout > 0 else None
+            self.conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=(kernel_size, kernel_size),
+                stride=(stride, stride),
+                padding=padding,
+                dilation=(dilation, dilation),
+                groups=groups,
+                bias=use_bias,
+            )
+
+        self.norm = build_norm(norm, num_features=out_channels)
+        self.act = build_act(act_func)
+        self.pad = F.pad
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.dropout is not None:
+            x = self.dropout(x)
+        if self.is_video:  # custom padding for 3d conv
+            x = self.pad(x, self.padding, mode=self.pad_mode_3d)  # "constant" padding defaults to 0
+        x = self.conv(x)
+        if self.norm:
+            x = self.norm(x)
+        if self.act:
+            x = self.act(x)
+        return x
+
+
+class UpSampleLayer(nn.Module):
+    def __init__(
+        self,
+        mode="bicubic",
+        size: Optional[int | tuple[int, int] | list[int]] = None,
+        factor=2,
+        align_corners=False,
+    ):
+        super().__init__()
+        self.mode = mode
+        self.size = val2list(size, 2) if size is not None else None
+        self.factor = None if self.size is not None else factor
+        self.align_corners = align_corners
+
+    @torch.autocast(device_type="cuda", enabled=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if (self.size is not None and tuple(x.shape[-2:]) == self.size) or self.factor == 1:
+            return x
+        if x.dtype in [torch.float16, torch.bfloat16]:
+            x = x.float()
+        return resize(x, self.size, self.factor, self.mode, self.align_corners)
+
+
+class ConvPixelUnshuffleDownSampleLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        factor: int,
+    ):
+        super().__init__()
+        self.factor = factor
+        out_ratio = factor**2
+        assert out_channels % out_ratio == 0
+        self.conv = ConvLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // out_ratio,
+            kernel_size=kernel_size,
+            use_bias=True,
+            norm=None,
+            act_func=None,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)
+        x = F.pixel_unshuffle(x, self.factor)
+        return x
+
+
+class PixelUnshuffleChannelAveragingDownSampleLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        factor: int,
+        temporal_downsample: bool = False,  # temporal downsample for 5d input tensor
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.factor = factor
+        self.temporal_downsample = temporal_downsample
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.dim() == 4:
+            assert self.in_channels * self.factor**2 % self.out_channels == 0
+            group_size = self.in_channels * self.factor**2 // self.out_channels
+            x = F.pixel_unshuffle(x, self.factor)
+            B, C, H, W = x.shape
+            x = x.view(B, self.out_channels, group_size, H, W)
+            x = x.mean(dim=2)
+        elif x.dim() == 5:  # [B, C, T, H, W]
+            _, _, T, _, _ = x.shape
+            if self.temporal_downsample and T != 1:  # 3d pixel unshuffle
+                x = pixel_unshuffle_3d(x, self.factor)
+                assert self.in_channels * self.factor**3 % self.out_channels == 0
+                group_size = self.in_channels * self.factor**3 // self.out_channels
+            else:  # 2d pixel unshuffle
+                x = x.permute(0, 2, 1, 3, 4)  # [B, T, C, H, W]
+                x = F.pixel_unshuffle(x, self.factor)
+                x = x.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]
+                assert self.in_channels * self.factor**2 % self.out_channels == 0
+                group_size = self.in_channels * self.factor**2 // self.out_channels
+            B, C, T, H, W = x.shape
+            x = x.view(B, self.out_channels, group_size, T, H, W)
+            x = x.mean(dim=2)
+        else:
+            raise ValueError(f"Unsupported input dimension: {x.dim()}")
+        return x
+
+    def __repr__(self):
+        return f"PixelUnshuffleChannelAveragingDownSampleLayer(in_channels={self.in_channels}, out_channels={self.out_channels}, factor={self.factor}), temporal_downsample={self.temporal_downsample}"
+
+
+class ConvPixelShuffleUpSampleLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        factor: int,
+    ):
+        super().__init__()
+        self.factor = factor
+        out_ratio = factor**2
+        self.conv = ConvLayer(
+            in_channels=in_channels,
+            out_channels=out_channels * out_ratio,
+            kernel_size=kernel_size,
+            use_bias=True,
+            norm=None,
+            act_func=None,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)
+        x = F.pixel_shuffle(x, self.factor)
+        return x
+
+
+class InterpolateConvUpSampleLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        factor: int,
+        mode: str = "nearest",
+        is_video: bool = False,
+        temporal_upsample: bool = False,
+    ) -> None:
+        super().__init__()
+        self.factor = factor
+        self.mode = mode
+        self.temporal_upsample = temporal_upsample
+        self.conv = ConvLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            use_bias=True,
+            norm=None,
+            act_func=None,
+            is_video=is_video,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.dim() == 4:
+            x = F.interpolate(x, scale_factor=self.factor, mode=self.mode)
+        elif x.dim() == 5:
+            # [B, C, T, H, W] -> [B, C, T*factor, H*factor, W*factor]
+            if self.temporal_upsample and x.size(2) != 1:  # temporal upsample for video input
+                x = chunked_interpolate(x, scale_factor=[self.factor, self.factor, self.factor], mode=self.mode)
+            else:
+                x = chunked_interpolate(x, scale_factor=[1, self.factor, self.factor], mode=self.mode)
+        x = self.conv(x)
+        return x
+
+    def __repr__(self):
+        return f"InterpolateConvUpSampleLayer(factor={self.factor}, mode={self.mode}, temporal_upsample={self.temporal_upsample})"
+
+
+class ChannelDuplicatingPixelShuffleUpSampleLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        factor: int,
+        temporal_upsample: bool = False,  # upsample on the temporal dimension as well
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.factor = factor
+        assert out_channels * factor**2 % in_channels == 0
+        self.temporal_upsample = temporal_upsample
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.dim() == 5:
+            B, C, T, H, W = x.shape
+            assert C == self.in_channels
+
+        if self.temporal_upsample and T != 1:  # video input
+            repeats = self.out_channels * self.factor**3 // self.in_channels
+        else:
+            repeats = self.out_channels * self.factor**2 // self.in_channels
+
+        x = x.repeat_interleave(repeats, dim=1)
+
+        if x.dim() == 4:  # original image-only training
+            x = F.pixel_shuffle(x, self.factor)
+        elif x.dim() == 5:  # [B, C, T, H, W]
+            if self.temporal_upsample and T != 1:  # video input
+                x = pixel_shuffle_3d(x, self.factor)
+            else:
+                x = x.permute(0, 2, 1, 3, 4)  # [B, T, C, H, W]
+                x = F.pixel_shuffle(x, self.factor)  # on H and W only
+                x = x.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]
+        return x
+
+    def __repr__(self):
+        return f"ChannelDuplicatingPixelShuffleUpSampleLayer(in_channels={self.in_channels}, out_channels={self.out_channels}, factor={self.factor}, temporal_upsample={self.temporal_upsample})"
+
+
+class LinearLayer(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        use_bias=True,
+        dropout=0,
+        norm=None,
+        act_func=None,
+    ):
+        super().__init__()
+
+        self.dropout = nn.Dropout(dropout, inplace=False) if dropout > 0 else None
+        self.linear = nn.Linear(in_features, out_features, use_bias)
+        self.norm = build_norm(norm, num_features=out_features)
+        self.act = build_act(act_func)
+
+    def _try_squeeze(self, x: torch.Tensor) -> torch.Tensor:
+        if x.dim() > 2:
+            x = torch.flatten(x, start_dim=1)
+        return x
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._try_squeeze(x)
+        if self.dropout:
+            x = self.dropout(x)
+        x = self.linear(x)
+        if self.norm:
+            x = self.norm(x)
+        if self.act:
+            x = self.act(x)
+        return x
+
+
+class IdentityLayer(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x
+
+
+#################################################################################
+#                             Basic Blocks                                      #
+#################################################################################
+
+
+class DSConv(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,
+        use_bias=False,
+        norm=("bn2d", "bn2d"),
+        act_func=("relu6", None),
+    ):
+        super().__init__()
+
+        use_bias = val2tuple(use_bias, 2)
+        norm = val2tuple(norm, 2)
+        act_func = val2tuple(act_func, 2)
+
+        self.depth_conv = ConvLayer(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride,
+            groups=in_channels,
+            norm=norm[0],
+            act_func=act_func[0],
+            use_bias=use_bias[0],
+        )
+        self.point_conv = ConvLayer(
+            in_channels,
+            out_channels,
+            1,
+            norm=norm[1],
+            act_func=act_func[1],
+            use_bias=use_bias[1],
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.depth_conv(x)
+        x = self.point_conv(x)
+        return x
+
+
+class MBConv(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,
+        mid_channels=None,
+        expand_ratio=6,
+        use_bias=False,
+        norm=("bn2d", "bn2d", "bn2d"),
+        act_func=("relu6", "relu6", None),
+    ):
+        super().__init__()
+
+        use_bias = val2tuple(use_bias, 3)
+        norm = val2tuple(norm, 3)
+        act_func = val2tuple(act_func, 3)
+        mid_channels = round(in_channels * expand_ratio) if mid_channels is None else mid_channels
+
+        self.inverted_conv = ConvLayer(
+            in_channels,
+            mid_channels,
+            1,
+            stride=1,
+            norm=norm[0],
+            act_func=act_func[0],
+            use_bias=use_bias[0],
+        )
+        self.depth_conv = ConvLayer(
+            mid_channels,
+            mid_channels,
+            kernel_size,
+            stride=stride,
+            groups=mid_channels,
+            norm=norm[1],
+            act_func=act_func[1],
+            use_bias=use_bias[1],
+        )
+        self.point_conv = ConvLayer(
+            mid_channels,
+            out_channels,
+            1,
+            norm=norm[2],
+            act_func=act_func[2],
+            use_bias=use_bias[2],
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.inverted_conv(x)
+        x = self.depth_conv(x)
+        x = self.point_conv(x)
+        return x
+
+
+class FusedMBConv(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,
+        mid_channels=None,
+        expand_ratio=6,
+        groups=1,
+        use_bias=False,
+        norm=("bn2d", "bn2d"),
+        act_func=("relu6", None),
+    ):
+        super().__init__()
+        use_bias = val2tuple(use_bias, 2)
+        norm = val2tuple(norm, 2)
+        act_func = val2tuple(act_func, 2)
+
+        mid_channels = round(in_channels * expand_ratio) if mid_channels is None else mid_channels
+
+        self.spatial_conv = ConvLayer(
+            in_channels,
+            mid_channels,
+            kernel_size,
+            stride,
+            groups=groups,
+            use_bias=use_bias[0],
+            norm=norm[0],
+            act_func=act_func[0],
+        )
+        self.point_conv = ConvLayer(
+            mid_channels,
+            out_channels,
+            1,
+            use_bias=use_bias[1],
+            norm=norm[1],
+            act_func=act_func[1],
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.spatial_conv(x)
+        x = self.point_conv(x)
+        return x
+
+
+class GLUMBConv(nn.Module):  # inverted conv -> depthwise conv -> gated (GLU-style) product -> point conv
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,
+        mid_channels=None,  # when None, derived below as round(in_channels * expand_ratio)
+        expand_ratio=6,
+        use_bias=False,  # scalar or triple: one entry each for inverted_conv, depth_conv, point_conv
+        norm=(None, None, "ln2d"),
+        act_func=("silu", "silu", None),  # entry 1 is used for the gate activation, not inside depth_conv
+        is_video=False,  # forwarded unchanged to every ConvLayer
+    ):
+        super().__init__()
+        use_bias = val2tuple(use_bias, 3)
+        norm = val2tuple(norm, 3)
+        act_func = val2tuple(act_func, 3)
+
+        mid_channels = round(in_channels * expand_ratio) if mid_channels is None else mid_channels
+
+        self.glu_act = build_act(act_func[1], inplace=False)  # applied to the gate half in forward()
+        self.inverted_conv = ConvLayer(
+            in_channels,
+            mid_channels * 2,  # doubled so forward() can split the result into value and gate halves
+            1,
+            use_bias=use_bias[0],
+            norm=norm[0],
+            act_func=act_func[0],
+            is_video=is_video,
+        )
+        self.depth_conv = ConvLayer(
+            mid_channels * 2,
+            mid_channels * 2,
+            kernel_size,
+            stride=stride,
+            groups=mid_channels * 2,  # groups == channel count, i.e. one group per channel
+            use_bias=use_bias[1],
+            norm=norm[1],
+            act_func=None,  # no activation here; act_func[1] is applied to the gate via glu_act instead
+            is_video=is_video,
+        )
+        self.point_conv = ConvLayer(
+            mid_channels,  # input is the gated product, which has half of depth_conv's channels
+            out_channels,
+            1,
+            use_bias=use_bias[2],
+            norm=norm[2],
+            act_func=act_func[2],
+            is_video=is_video,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.inverted_conv(x)
+        x = self.depth_conv(x)
+
+        x, gate = torch.chunk(x, 2, dim=1)  # split dim 1 into two halves: value and gate
+        gate = self.glu_act(gate)
+        x = x * gate  # element-wise gating
+
+        x = self.point_conv(x)
+        return x
+
+
+class ResBlock(nn.Module):  # two stacked ConvLayers (conv1 -> conv2); no skip connection is added inside this class
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size=3,
+        stride=1,  # passed to conv1 only; conv2 is built with 1 in the same positional slot
+        mid_channels=None,  # when None, derived below as round(in_channels * expand_ratio)
+        expand_ratio=1,
+        use_bias=False,  # scalar or pair: index 0 -> conv1, index 1 -> conv2 (same for norm / act_func)
+        norm=("bn2d", "bn2d"),
+        act_func=("relu6", None),
+        is_video=False,  # forwarded unchanged to both ConvLayers
+    ):
+        super().__init__()
+        use_bias = val2tuple(use_bias, 2)
+        norm = val2tuple(norm, 2)
+        act_func = val2tuple(act_func, 2)
+
+        mid_channels = round(in_channels * expand_ratio) if mid_channels is None else mid_channels
+
+        self.conv1 = ConvLayer(
+            in_channels,
+            mid_channels,
+            kernel_size,
+            stride,
+            use_bias=use_bias[0],
+            norm=norm[0],
+            act_func=act_func[0],
+            is_video=is_video,
+        )
+        self.conv2 = ConvLayer(
+            mid_channels,
+            out_channels,
+            kernel_size,
+            1,
+            use_bias=use_bias[1],
+            norm=norm[1],
+            act_func=act_func[1],
+            is_video=is_video,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # applies conv1, then conv2
+        x = self.conv1(x)
+        x = self.conv2(x)
+        return x
+
+
+class LiteMLA(nn.Module):
+    r"""Lightweight multi-scale linear attention"""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        heads: Optional[int] = None,  # when None, derived below from in_channels, dim and heads_ratio
+        heads_ratio: float = 1.0,
+        dim=8,  # per-head channel count; q, k and v each take `dim` channels per head
+        use_bias=False,  # scalar or pair: index 0 -> qkv and aggregation convs, index 1 -> proj
+        norm=(None, "bn2d"),
+        act_func=(None, None),
+        kernel_func="relu",  # activation name given to build_act; applied to q and k before the matmuls
+        scales: tuple[int, ...] = (5,),  # one aggregation branch (kernel size = scale) is built per entry
+        eps=1.0e-15,  # added to the attention normalizer before dividing
+        is_video=False,  # selects ChannelChunkConv3d instead of nn.Conv2d for the aggregation convs
+    ):
+        super().__init__()
+        self.eps = eps
+        heads = int(in_channels // dim * heads_ratio) if heads is None else heads
+
+        total_dim = heads * dim
+
+        use_bias = val2tuple(use_bias, 2)
+        norm = val2tuple(norm, 2)
+        act_func = val2tuple(act_func, 2)
+
+        self.dim = dim
+        self.qkv = ConvLayer(
+            in_channels,
+            3 * total_dim,  # q, k and v stacked along the channel dimension
+            1,
+            use_bias=use_bias[0],
+            norm=norm[0],
+            act_func=act_func[0],
+            is_video=is_video,
+        )
+        conv_class = nn.Conv2d if not is_video else ChannelChunkConv3d
+        self.aggreg = nn.ModuleList(
+            [
+                nn.Sequential(
+                    conv_class(  # per-channel conv (groups == channels) with kernel size `scale`
+                        3 * total_dim,
+                        3 * total_dim,
+                        scale,
+                        padding=get_same_padding(scale),
+                        groups=3 * total_dim,
+                        bias=use_bias[0],
+                    ),
+                    conv_class(3 * total_dim, 3 * total_dim, 1, groups=3 * heads, bias=use_bias[0]),
+                )
+                for scale in scales
+            ]
+        )
+        self.kernel_func = build_act(kernel_func, inplace=False)
+
+        self.proj = ConvLayer(
+            total_dim * (1 + len(scales)),  # attention output of the base qkv plus one per aggregation branch
+            out_channels,
+            1,
+            use_bias=use_bias[1],
+            norm=norm[1],
+            act_func=act_func[1],
+            is_video=is_video,
+        )
+
+    @torch.autocast(device_type="cuda", enabled=False)  # CUDA autocast is disabled inside this method
+    def relu_linear_att(self, qkv: torch.Tensor) -> torch.Tensor:
+        if qkv.ndim == 5:
+            B, _, T, H, W = list(qkv.size())
+            is_video = True
+        else:
+            B, _, H, W = list(qkv.size())
+            is_video = False
+
+        if qkv.dtype == torch.float16:  # float16 input is upcast to float32 (bfloat16 is handled after the matmuls)
+            qkv = qkv.float()
+
+        if qkv.ndim == 4:
+            qkv = torch.reshape(  # -> (B, groups, 3 * dim, H * W); each group stacks q, k, v of one head
+                qkv,
+                (
+                    B,
+                    -1,
+                    3 * self.dim,
+                    H * W,
+                ),
+            )
+        elif qkv.ndim == 5:
+            qkv = torch.reshape(  # video case: tokens are flattened over H * W * T
+                qkv,
+                (
+                    B,
+                    -1,
+                    3 * self.dim,
+                    H * W * T,
+                ),
+            )
+        q, k, v = (  # split dim 2 into three slices of `dim` channels each
+            qkv[:, :, 0 : self.dim],
+            qkv[:, :, self.dim : 2 * self.dim],
+            qkv[:, :, 2 * self.dim :],
+        )
+
+        # lightweight linear attention
+        q = self.kernel_func(q)
+        k = self.kernel_func(k)
+
+        # linear matmul
+        trans_k = k.transpose(-1, -2)
+
+        v = F.pad(v, (0, 0, 0, 1), mode="constant", value=1)  # append a row of ones along dim -2
+        vk = torch.matmul(v, trans_k)  # (dim + 1, tokens) @ (tokens, dim)
+        out = torch.matmul(vk, q)  # the appended ones row makes the last row of `out` the normalizer
+        if out.dtype == torch.bfloat16:
+            out = out.float()
+        out = out[:, :, :-1] / (out[:, :, -1:] + self.eps)  # divide by the normalizer row, then drop it
+
+        if not is_video:
+            out = torch.reshape(out, (B, -1, H, W))
+        else:
+            out = torch.reshape(out, (B, -1, T, H, W))
+        return out
+
+    @torch.autocast(device_type="cuda", enabled=False)
+    def relu_quadratic_att(self, qkv: torch.Tensor) -> torch.Tensor:  # 4D input only (unpacks exactly B, C, H, W)
+        B, _, H, W = list(qkv.size())
+
+        qkv = torch.reshape(
+            qkv,
+            (
+                B,
+                -1,
+                3 * self.dim,
+                H * W,
+            ),
+        )
+        q, k, v = (
+            qkv[:, :, 0 : self.dim],
+            qkv[:, :, self.dim : 2 * self.dim],
+            qkv[:, :, 2 * self.dim :],
+        )
+
+        q = self.kernel_func(q)
+        k = self.kernel_func(k)
+
+        att_map = torch.matmul(k.transpose(-1, -2), q)  # b h n n
+        original_dtype = att_map.dtype
+        if original_dtype in [torch.float16, torch.bfloat16]:  # normalize in float32, then cast back
+            att_map = att_map.float()
+        att_map = att_map / (torch.sum(att_map, dim=2, keepdim=True) + self.eps)  # b h n n
+        att_map = att_map.to(original_dtype)
+        out = torch.matmul(v, att_map)  # b h d n
+
+        out = torch.reshape(out, (B, -1, H, W))
+        return out
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # generate multi-scale q, k, v
+        qkv = self.qkv(x)
+        multi_scale_qkv = [qkv]
+        for op in self.aggreg:
+            multi_scale_qkv.append(op(qkv))
+        qkv = torch.cat(multi_scale_qkv, dim=1)  # base qkv and every aggregated variant, stacked on channels
+
+        if qkv.ndim == 4:
+            H, W = list(qkv.size())[-2:]  # NOTE(review): H/W/T are only needed by the disabled check below
+            # num_tokens = H * W
+        elif qkv.ndim == 5:
+            _, _, T, H, W = list(qkv.size())
+            # num_tokens = H * W * T
+
+        # if num_tokens > self.dim:
+        out = self.relu_linear_att(qkv).to(qkv.dtype)  # linear attention is always used; cast back to qkv dtype
+        # else:
+        #     if self.is_video:  (NOTE(review): __init__ never assigns self.is_video)
+        #         raise NotImplementedError("Video is not supported for quadratic attention")
+        #     out = self.relu_quadratic_att(qkv)
+        out = self.proj(out)
+
+        return out
+
+
+class EfficientViTBlock(nn.Module):  # context module (LiteMLA) followed by a local module, each wrapped in ResidualBlock
+    def __init__(
+        self,
+        in_channels: int,  # both sub-modules are built with out_channels == in_channels
+        heads_ratio: float = 1.0,
+        dim=32,
+        expand_ratio: float = 4,  # used by the local module only
+        scales: tuple[int, ...] = (5,),
+        norm: str = "bn2d",  # used as the final norm of each sub-module; earlier norms are None
+        act_func: str = "hswish",  # used by the local module only
+        context_module: str = "LiteMLA",  # only "LiteMLA" is accepted
+        local_module: str = "MBConv",  # "MBConv" or "GLUMBConv"
+        is_video: bool = False,
+    ):
+        super().__init__()
+        if context_module == "LiteMLA":
+            self.context_module = ResidualBlock(
+                LiteMLA(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    heads_ratio=heads_ratio,
+                    dim=dim,
+                    norm=(None, norm),
+                    scales=scales,
+                    is_video=is_video,
+                ),
+                IdentityLayer(),  # shortcut branch of the residual wrapper
+            )
+        else:
+            raise ValueError(f"context_module {context_module} is not supported")
+        if local_module == "MBConv":
+            self.local_module = ResidualBlock(
+                MBConv(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    expand_ratio=expand_ratio,
+                    use_bias=(True, True, False),
+                    norm=(None, None, norm),
+                    act_func=(act_func, act_func, None),
+                    is_video=is_video,
+                ),
+                IdentityLayer(),
+            )
+        elif local_module == "GLUMBConv":
+            self.local_module = ResidualBlock(
+                GLUMBConv(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    expand_ratio=expand_ratio,
+                    use_bias=(True, True, False),
+                    norm=(None, None, norm),
+                    act_func=(act_func, act_func, None),
+                    is_video=is_video,
+                ),
+                IdentityLayer(),
+            )
+        else:  # NOTE(review): raises NotImplementedError here but ValueError for an unknown context_module
+            raise NotImplementedError(f"local_module {local_module} is not supported")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # context module first, then local module
+        x = self.context_module(x)
+        x = self.local_module(x)
+        return x
+
+
+#################################################################################
+#                             Functional Blocks                                 #
+#################################################################################
+
+
+class ResidualBlock(nn.Module):  # computes main(pre_norm(x)) + shortcut(x), with optional pieces (see forward)
+    def __init__(
+        self,
+        main: Optional[nn.Module],  # None makes forward() return its input unchanged
+        shortcut: Optional[nn.Module],  # None makes forward() return the main branch only
+        post_act=None,  # passed to build_act; the result is applied after the sum when truthy
+        pre_norm: Optional[nn.Module] = None,  # applied to the input of `main` only, never to the shortcut
+    ):
+        super().__init__()
+
+        self.pre_norm = pre_norm
+        self.main = main
+        self.shortcut = shortcut
+        self.post_act = build_act(post_act)  # presumably None when post_act is None -- TODO confirm in build_act
+
+    def forward_main(self, x: torch.Tensor) -> torch.Tensor:  # main branch, with pre_norm applied first if set
+        if self.pre_norm is None:
+            return self.main(x)
+        else:
+            return self.main(self.pre_norm(x))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.main is None:
+            res = x
+        elif self.shortcut is None:
+            res = self.forward_main(x)
+        else:
+            res = self.forward_main(x) + self.shortcut(x)
+            if self.post_act:  # post_act is only applied in this branch (both main and shortcut present)
+                res = self.post_act(res)
+        return res
+
+
+class DAGBlock(nn.Module):  # reads several entries of a feature dict, merges them, and writes new entries back
+    def __init__(
+        self,
+        inputs: dict[str, nn.Module],  # feature-dict key -> op applied to that feature before merging
+        merge: str,  # "add" or "cat"; anything else raises NotImplementedError in forward()
+        post_input: Optional[nn.Module],  # optional op applied to the merged feature
+        middle: nn.Module,
+        outputs: dict[str, nn.Module],  # feature-dict key -> op producing that entry from the middle output
+    ):
+        super().__init__()
+
+        self.input_keys = list(inputs.keys())  # keys and ops are stored as parallel lists in the same order
+        self.input_ops = nn.ModuleList(list(inputs.values()))
+        self.merge = merge
+        self.post_input = post_input
+
+        self.middle = middle
+
+        self.output_keys = list(outputs.keys())
+        self.output_ops = nn.ModuleList(list(outputs.values()))
+
+    def forward(self, feature_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+        feat = [op(feature_dict[key]) for key, op in zip(self.input_keys, self.input_ops)]
+        if self.merge == "add":
+            feat = list_sum(feat)
+        elif self.merge == "cat":
+            feat = torch.concat(feat, dim=1)  # concatenate along dim 1
+        else:
+            raise NotImplementedError
+        if self.post_input is not None:
+            feat = self.post_input(feat)
+        feat = self.middle(feat)
+        for key, op in zip(self.output_keys, self.output_ops):
+            feature_dict[key] = op(feat)  # the caller's dict is updated in place
+        return feature_dict  # same dict object that was passed in
+
+
+class OpSequential(nn.Module):  # applies the given ops in order, ignoring None entries
+    def __init__(self, op_list: list[Optional[nn.Module]]):
+        super().__init__()
+        valid_op_list = []
+        for op in op_list:
+            if op is not None:  # None entries are dropped
+                valid_op_list.append(op)
+        self.op_list = nn.ModuleList(valid_op_list)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # feeds x through every stored op, in list order
+        for op in self.op_list:
+            x = op(x)
+        return x
diff --git a/opensora/models/dc_ae/models/nn/vo_ops.py b/opensora/models/dc_ae/models/nn/vo_ops.py
new file mode 100644
index 0000000..343a54d
--- /dev/null
+++ b/opensora/models/dc_ae/models/nn/vo_ops.py
@@ -0,0 +1,244 @@
+import math
+from inspect import signature
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+
+VERBOSE = False  # when True, pixel_shuffle_3d and chunked_interpolate print intermediate tensors / chunk statistics
+
+
+def pixel_shuffle_3d(x, upscale_factor):
+    """
+    3D pixel shuffle: (B, C, T, H, W) -> (B, C / r^3, T * r, H * r, W * r) with r = upscale_factor.
+    """
+    B, C, T, H, W = x.shape
+    r = upscale_factor
+    assert C % (r * r * r) == 0, "通道数必须是上采样因子的立方倍数"
+
+    C_new = C // (r * r * r)
+    x = x.view(B, C_new, r, r, r, T, H, W)  # split the channels into (C_new, r, r, r)
+    if VERBOSE:
+        print("x.view:")
+        print(x)
+        print("x.view.shape:")
+        print(x.shape)
+
+    x = x.permute(0, 1, 5, 2, 6, 3, 7, 4)  # -> (B, C_new, T, r, H, r, W, r): each r sits next to the axis it expands
+    if VERBOSE:
+        print("x.permute:")
+        print(x)
+        print("x.permute.shape:")
+        print(x.shape)
+
+    y = x.reshape(B, C_new, T * r, H * r, W * r)  # merge each (axis, r) pair
+    return y
+
+
+def pixel_unshuffle_3d(x, downsample_factor):
+    """
+    3D pixel unshuffle: (B, C, T, H, W) -> (B, C * r^3, T / r, H / r, W / r) with r = downsample_factor.
+    """
+    B, C, T, H, W = x.shape
+
+    r = downsample_factor
+    assert T % r == 0, f"时间维度必须是下采样因子的倍数, got shape {x.shape}"
+    assert H % r == 0, f"高度维度必须是下采样因子的倍数, got shape {x.shape}"
+    assert W % r == 0, f"宽度维度必须是下采样因子的倍数, got shape {x.shape}"
+    T_new = T // r
+    H_new = H // r
+    W_new = W // r
+    C_new = C * (r * r * r)
+
+    x = x.view(B, C, T_new, r, H_new, r, W_new, r)  # split each of T, H, W into (size / r, r)
+    x = x.permute(0, 1, 3, 5, 7, 2, 4, 6)  # -> (B, C, r, r, r, T_new, H_new, W_new)
+    y = x.reshape(B, C_new, T_new, H_new, W_new)  # fold the three r axes into the channel axis
+    return y
+
+
+def test_pixel_shuffle_3d():
+    # Input tensor (B, C, T, H, W) = (1, 16, 2, 4, 4)
+    x = torch.arange(1, 1 + 1 * 16 * 2 * 4 * 4).view(1, 16, 2, 4, 4).float()
+    print("x:")
+    print(x)
+    print("x.shape:")
+    print(x.shape)
+
+    upscale_factor = 2
+
+    # Apply the custom pixel_shuffle_3d
+    y = pixel_shuffle_3d(x, upscale_factor)
+    print("pixelshuffle_3d 结果:")
+    print(y)
+    print("输出形状:", y.shape)
+    # Expected output shape: (1, 2, 4, 8, 8)
+    # because:
+    # - the channel count goes from 16 to 2 (16 / (2*2*2))
+    # - the time dimension goes from 2 to 4 (2*2)
+    # - the height goes from 4 to 8 (4*2)
+    # - the width goes from 4 to 8 (4*2)
+
+    print(torch.allclose(x, pixel_unshuffle_3d(y, upscale_factor)))  # round-trip check; printed, not asserted
+
+
+def chunked_interpolate(x, scale_factor, mode="nearest"):
+    """
+    Interpolate large tensors by chunking along the channel dimension. https://discuss.pytorch.org/t/error-using-f-interpolate-for-large-3d-input/207859
+    Only 'nearest' is supported: `mode` is checked by an assert and the F.interpolate call below always uses "nearest".
+
+    Args:
+        x (torch.Tensor): Input tensor (B, C, D, H, W); any other rank raises ValueError.
+        scale_factor: Scaling factors, indexed as (d, h, w).
+
+    Returns:
+        torch.Tensor: The interpolated chunks concatenated along dim 1 (channels).
+    """
+    assert (
+        mode == "nearest"
+    ), "Only the nearest mode is supported"  # actually other modes are theoretically supported but not tested
+    if len(x.shape) != 5:
+        raise ValueError("Expected 5D input tensor (B, C, D, H, W)")
+
+    # Calculate max chunk size to avoid int32 overflow. num_elements < max_int32
+    # Max int32 is 2^31 - 1
+    max_elements_per_chunk = 2**31 - 1
+
+    # Estimate output spatial dimensions (rounded up; used only to size the chunks below)
+    out_d = math.ceil(x.shape[2] * scale_factor[0])
+    out_h = math.ceil(x.shape[3] * scale_factor[1])
+    out_w = math.ceil(x.shape[4] * scale_factor[2])
+
+    # Calculate max channels per chunk to stay under limit
+    elements_per_channel = out_d * out_h * out_w
+    max_channels = max_elements_per_chunk // (x.shape[0] * elements_per_channel)  # batch size counts toward the limit
+
+    # Use smaller of max channels or input channels
+    chunk_size = min(max_channels, x.shape[1])
+
+    # Ensure at least 1 channel per chunk
+    chunk_size = max(1, chunk_size)
+    if VERBOSE:
+        print(f"Input channels: {x.shape[1]}")
+        print(f"Chunk size: {chunk_size}")
+        print(f"max_channels: {max_channels}")
+        print(f"num_chunks: {math.ceil(x.shape[1] / chunk_size)}")
+
+    chunks = []
+    for i in range(0, x.shape[1], chunk_size):
+        start_idx = i
+        end_idx = min(i + chunk_size, x.shape[1])  # the last chunk may be shorter than chunk_size
+
+        chunk = x[:, start_idx:end_idx, :, :, :]
+
+        interpolated_chunk = F.interpolate(chunk, scale_factor=scale_factor, mode="nearest")
+
+        chunks.append(interpolated_chunk)
+
+    if not chunks:  # reached only when the input has zero channels
+        raise ValueError(f"No chunks were generated. Input shape: {x.shape}")
+
+    # Concatenate chunks along channel dimension
+    return torch.cat(chunks, dim=1)
+
+
+def test_chunked_interpolate():  # compares chunked_interpolate against F.interpolate; needs CUDA (inputs use .cuda())
+    # Test case 1: Basic upscaling with scale_factor
+    x1 = torch.randn(2, 16, 16, 32, 32).cuda()
+    scale_factor = (2.0, 2.0, 2.0)
+    assert torch.allclose(
+        chunked_interpolate(x1, scale_factor=scale_factor), F.interpolate(x1, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 3: Downscaling with scale_factor
+    x3 = torch.randn(2, 16, 32, 64, 64).cuda()
+    scale_factor = (0.5, 0.5, 0.5)
+    assert torch.allclose(
+        chunked_interpolate(x3, scale_factor=scale_factor), F.interpolate(x3, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 4: Different scales per dimension
+    x4 = torch.randn(2, 16, 16, 32, 32).cuda()
+    scale_factor = (2.0, 1.5, 1.5)
+    assert torch.allclose(
+        chunked_interpolate(x4, scale_factor=scale_factor), F.interpolate(x4, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 5: Large input tensor
+    x5 = torch.randn(2, 16, 64, 128, 128).cuda()
+    scale_factor = (2.0, 2.0, 2.0)
+    assert torch.allclose(
+        chunked_interpolate(x5, scale_factor=scale_factor), F.interpolate(x5, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 7: Smaller depth dimension (D = 8)
+    x7 = torch.randn(2, 16, 8, 32, 32).cuda()
+    scale_factor = (2.0, 2.0, 2.0)
+    assert torch.allclose(
+        chunked_interpolate(x7, scale_factor=scale_factor), F.interpolate(x7, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 8: Single channel input
+    x8 = torch.randn(2, 1, 16, 32, 32).cuda()
+    scale_factor = (2.0, 2.0, 2.0)
+    assert torch.allclose(
+        chunked_interpolate(x8, scale_factor=scale_factor), F.interpolate(x8, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 9: Minimal batch size
+    x9 = torch.randn(1, 16, 32, 64, 64).cuda()
+    scale_factor = (0.5, 0.5, 0.5)
+    assert torch.allclose(
+        chunked_interpolate(x9, scale_factor=scale_factor), F.interpolate(x9, scale_factor=scale_factor, mode="nearest")
+    )
+
+    # Test case 10: Non-power-of-2 dimensions
+    x10 = torch.randn(2, 16, 15, 31, 31).cuda()
+    scale_factor = (2.0, 2.0, 2.0)
+    assert torch.allclose(
+        chunked_interpolate(x10, scale_factor=scale_factor),
+        F.interpolate(x10, scale_factor=scale_factor, mode="nearest"),
+    )
+
+    # TODO: test case 11 (large output tensor) is not implemented
+
+
+def get_same_padding(kernel_size: Union[int, tuple[int, ...]]) -> Union[int, tuple[int, ...]]:  # kernel_size // 2 per entry
+    if isinstance(kernel_size, tuple):
+        return tuple([get_same_padding(ks) for ks in kernel_size])  # element-wise, recursing into nested tuples
+    else:
+        assert kernel_size % 2 > 0, "kernel size should be odd number"
+        return kernel_size // 2
+
+
+def resize(  # thin wrapper around F.interpolate that restricts `mode` to four supported values
+    x: torch.Tensor,
+    size: Optional[Any] = None,
+    scale_factor: Optional[list[float]] = None,
+    mode: str = "bicubic",
+    align_corners: Optional[bool] = False,  # only forwarded for "bilinear" / "bicubic"
+) -> torch.Tensor:
+    if mode in {"bilinear", "bicubic"}:
+        return F.interpolate(
+            x,
+            size=size,
+            scale_factor=scale_factor,
+            mode=mode,
+            align_corners=align_corners,
+        )
+    elif mode in {"nearest", "area"}:  # align_corners is not passed for these modes
+        return F.interpolate(x, size=size, scale_factor=scale_factor, mode=mode)
+    else:
+        raise NotImplementedError(f"resize(mode={mode}) not implemented.")
+
+
+def build_kwargs_from_config(config: dict, target_func: Callable) -> dict[str, Any]:  # keep only keys target_func accepts
+    valid_keys = list(signature(target_func).parameters)  # parameter names of target_func
+    kwargs = {}
+    for key in config:
+        if key in valid_keys:  # config entries that are not parameters of target_func are dropped
+            kwargs[key] = config[key]
+    return kwargs
+
+
+if __name__ == "__main__":  # running the module directly executes the interpolation checks (they call .cuda())
+    test_chunked_interpolate()
diff --git a/opensora/models/dc_ae/utils/__init__.py b/opensora/models/dc_ae/utils/__init__.py
new file mode 100644
index 0000000..eafb232
--- /dev/null
+++ b/opensora/models/dc_ae/utils/__init__.py
@@ -0,0 +1,3 @@
+from .init import *
+from .list import *
+
diff --git a/opensora/models/dc_ae/utils/init.py b/opensora/models/dc_ae/utils/init.py
new file mode 100644
index 0000000..de650d4
--- /dev/null
+++ b/opensora/models/dc_ae/utils/init.py
@@ -0,0 +1,63 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+import torch
+import torch.nn as nn
+from torch.nn.modules.batchnorm import _BatchNorm
+
+__all__ = ["init_modules"]
+
+
+def init_modules(model: Union[nn.Module, list[nn.Module]], init_type="trunc_normal") -> None:
+    """Initialize `model` (a module, or a list of modules handled recursively) in place.
+
+    `init_type` is "trunc_normal" or "normal" with optional "@<std>" (default 0.02); others raise NotImplementedError.
+    """
+    _DEFAULT_INIT_PARAM = {"trunc_normal": 0.02}
+    if isinstance(model, list):
+        for sub_module in model:
+            init_modules(sub_module, init_type)
+        return
+
+    std = float(init_type.split("@")[1]) if "@" in init_type else _DEFAULT_INIT_PARAM["trunc_normal"]
+    if init_type.startswith("trunc_normal"):
+        init_func = lambda param: nn.init.trunc_normal_(param, std=std)
+    elif init_type.startswith("normal"):
+        init_func = lambda param: nn.init.normal_(param, std=std)
+    else:
+        raise NotImplementedError
+
+    for m in model.modules():
+        if isinstance(m, (nn.Conv2d, nn.Linear, nn.ConvTranspose2d)):
+            init_func(m.weight)
+            if m.bias is not None:
+                m.bias.data.zero_()
+        elif isinstance(m, nn.Embedding):
+            init_func(m.weight)
+        elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):  # affine-less norms keep weight/bias as None
+            if m.weight is not None:
+                m.weight.data.fill_(1)
+            if m.bias is not None:
+                m.bias.data.zero_()
+        else:
+            weight = getattr(m, "weight", None)
+            bias = getattr(m, "bias", None)
+            if isinstance(weight, torch.nn.Parameter):
+                init_func(weight)
+            if isinstance(bias, torch.nn.Parameter):
+                bias.data.zero_()
\ No newline at end of file
diff --git a/opensora/models/dc_ae/utils/list.py b/opensora/models/dc_ae/utils/list.py
new file mode 100644
index 0000000..cb58fbe
--- /dev/null
+++ b/opensora/models/dc_ae/utils/list.py
@@ -0,0 +1,68 @@
+# Copyright 2024 MIT Han Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Union
+
+__all__ = [
+    "list_sum",
+    "list_mean",
+    "weighted_list_sum",
+    "list_join",
+    "val2list",
+    "val2tuple",
+    "squeeze_list",
+]
+
+
+def list_sum(x: list) -> Any:  # x[0] + (x[1] + (...)) via recursion on x[1:]; an empty list raises IndexError
+    return x[0] if len(x) == 1 else x[0] + list_sum(x[1:])
+
+
+def list_mean(x: list) -> Any:  # list_sum(x) divided by the number of elements
+    return list_sum(x) / len(x)
+
+
+def weighted_list_sum(x: list, weights: list) -> Any:  # sum of x[i] * weights[i], computed recursively
+    assert len(x) == len(weights)  # both lists must have the same length
+    return x[0] * weights[0] if len(x) == 1 else x[0] * weights[0] + weighted_list_sum(x[1:], weights[1:])
+
+
+def list_join(x: list, sep="\t", format_str="%s") -> str:  # %-format every element with format_str, then join with sep
+    return sep.join([format_str % val for val in x])
+
+
+def val2list(x: Union[list, tuple, Any], repeat_time=1) -> list:  # list/tuple -> list copy; other value -> repeated list
+    if isinstance(x, (list, tuple)):
+        return list(x)  # repeat_time is ignored for list / tuple input
+    return [x for _ in range(repeat_time)]
+
+
+def val2tuple(x: Union[list, tuple, Any], min_len: int = 1, idx_repeat: int = -1) -> tuple:  # tuple of >= min_len items
+    x = val2list(x)
+
+    # Pad up to min_len by inserting copies of x[idx_repeat] at position idx_repeat (no-op when already long enough).
+    if len(x) > 0:
+        x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
+
+    return tuple(x)
+
+
+def squeeze_list(x: Optional[list]) -> Union[list, Any]:  # unwrap a one-element list; otherwise return x unchanged
+    if x is not None and len(x) == 1:
+        return x[0]
+    else:
+        return x  # covers None, empty lists and lists with more than one element
+
diff --git a/opensora/models/hunyuan_vae/__init__.py b/opensora/models/hunyuan_vae/__init__.py
new file mode 100644
index 0000000..73d9765
--- /dev/null
+++ b/opensora/models/hunyuan_vae/__init__.py
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+import torch
+
+from .autoencoder_kl_causal_3d import CausalVAE3D_HUNYUAN
diff --git a/opensora/models/hunyuan_vae/autoencoder_kl_causal_3d.py b/opensora/models/hunyuan_vae/autoencoder_kl_causal_3d.py
new file mode 100644
index 0000000..9a5a08f
--- /dev/null
+++ b/opensora/models/hunyuan_vae/autoencoder_kl_causal_3d.py
@@ -0,0 +1,638 @@
+# Modified from diffusers==0.29.2 and HunyuanVideo
+#
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copyright 2024 HunyuanVideo
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+
+from opensora.registry import MODELS
+from opensora.utils.ckpt import load_checkpoint
+
+try:
+    # This diffusers is modified and packed in the mirror.
+    from diffusers.loaders import FromOriginalVAEMixin
+except ImportError:
+    # Use this to be compatible with the original diffusers.
+    from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
+
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils.accelerate_utils import apply_forward_hook
+
+from opensora.models.hunyuan_vae.vae import (
+    DecoderCausal3D,
+    DecoderOutput,
+    DiagonalGaussianDistribution,
+    EncoderCausal3D,
+)
+
+
+@dataclass
+class AutoEncoder3DConfig:  # hyper-parameters consumed by AutoencoderKLCausal3D.__init__
+    from_pretrained: str | None  # checkpoint location; the CausalVAE3D_HUNYUAN factory loads it when truthy
+    act_fn: str = "silu"
+    in_channels: int = 3
+    out_channels: int = 3
+    latent_channels: int = 16  # channels of the latent z; the quant conv works on twice as many (the moments)
+    layers_per_block: int = 2
+    norm_num_groups: int = 32
+    scale_factor: float = 0.476986  # encode() returns scale_factor * (z - shift_factor); decode() inverts it
+    shift_factor: float = 0
+    time_compression_ratio: int = 4
+    spatial_compression_ratio: int = 8
+    mid_block_add_attention: bool = True
+    block_out_channels: tuple[int, ...] = (128, 256, 512, 512)
+    sample_size: int = 256  # spatial tile edge (input resolution) used when spatial tiling is on
+    sample_tsize: int = 64  # temporal tile length (input frames) used when temporal tiling is on
+    use_slicing: bool = False
+    use_spatial_tiling: bool = False
+    use_temporal_tiling: bool = False
+    tile_overlap_factor: float = 0.25  # fraction of a tile that overlaps its neighbour and gets blended
+    dropout: float = 0.0
+    channel: bool = False  # NOTE(review): not read anywhere in this file; confirm it is still needed
+
+
+class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+    r"""
+    A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(self, config: AutoEncoder3DConfig):
+        super().__init__()
+        # Build the causal 3D encoder/decoder pair and cache the slicing/tiling settings taken from `config`.
+        self.scale_factor = config.scale_factor
+        self.shift_factor = config.shift_factor
+
+        self.time_compression_ratio = config.time_compression_ratio
+        self.spatial_compression_ratio = config.spatial_compression_ratio
+        self.z_channels = config.latent_channels
+
+        self.encoder = EncoderCausal3D(
+            in_channels=config.in_channels,
+            out_channels=config.latent_channels,
+            block_out_channels=config.block_out_channels,
+            layers_per_block=config.layers_per_block,
+            act_fn=config.act_fn,
+            norm_num_groups=config.norm_num_groups,
+            double_z=True,
+            time_compression_ratio=config.time_compression_ratio,
+            spatial_compression_ratio=config.spatial_compression_ratio,
+            mid_block_add_attention=config.mid_block_add_attention,
+            dropout=config.dropout,
+        )
+
+        self.decoder = DecoderCausal3D(
+            in_channels=config.latent_channels,
+            out_channels=config.out_channels,
+            block_out_channels=config.block_out_channels,
+            layers_per_block=config.layers_per_block,
+            norm_num_groups=config.norm_num_groups,
+            act_fn=config.act_fn,
+            time_compression_ratio=config.time_compression_ratio,
+            spatial_compression_ratio=config.spatial_compression_ratio,
+            mid_block_add_attention=config.mid_block_add_attention,
+            dropout=config.dropout,
+        )
+        # 1x1x1 convs: one over the encoder moments (2 * latent channels), one over the latent before decoding
+        self.quant_conv = nn.Conv3d(2 * config.latent_channels, 2 * config.latent_channels, kernel_size=1)
+        self.post_quant_conv = nn.Conv3d(config.latent_channels, config.latent_channels, kernel_size=1)
+
+        self.use_slicing = config.use_slicing
+        self.use_spatial_tiling = config.use_spatial_tiling
+        self.use_temporal_tiling = config.use_temporal_tiling
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_tsize = config.sample_tsize
+        self.tile_latent_min_tsize = config.sample_tsize // config.time_compression_ratio
+
+        sample_size = config.sample_size[0] if isinstance(config.sample_size, (list, tuple)) else config.sample_size
+        self.tile_sample_min_size = sample_size  # keep a scalar: it is compared with tensor dims and used in slices
+        self.tile_latent_min_size = int(sample_size / (2 ** (len(config.block_out_channels) - 1)))
+        self.tile_overlap_factor = config.tile_overlap_factor
+
+    def enable_temporal_tiling(self, use_tiling: bool = True):  # switch tiling along the frame axis on or off
+        self.use_temporal_tiling = use_tiling
+
+    def disable_temporal_tiling(self):  # shorthand for enable_temporal_tiling(False)
+        self.enable_temporal_tiling(False)
+
+    def enable_spatial_tiling(self, use_tiling: bool = True):  # switch tiling along height/width on or off
+        self.use_spatial_tiling = use_tiling
+
+    def disable_spatial_tiling(self):  # shorthand for enable_spatial_tiling(False)
+        self.enable_spatial_tiling(False)
+
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Set both spatial and temporal tiling to `use_tiling`. With tiling on, `encode` and `decode` split inputs
+        larger than the configured tile sizes into overlapping tiles and process them in several steps. This is
+        useful for saving a large amount of memory and to allow processing larger videos.
+        """
+        self.enable_spatial_tiling(use_tiling)
+        self.enable_temporal_tiling(use_tiling)
+
+    def disable_tiling(self):
+        r"""
+        Turn off both spatial and temporal tiling. If `enable_tiling` was previously called, encoding and
+        decoding go back to processing the whole input in one step.
+        """
+        self.disable_spatial_tiling()
+        self.disable_temporal_tiling()
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE processing. When this option is enabled, `encode` (un-tiled path) and `decode` handle a
+        batch one element at a time. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE processing. If `enable_slicing` was previously enabled, this method will go back to
+        handling the whole batch in one step.
+        """
+        self.use_slicing = False
+
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: every processor used in the model, keyed by the dotted path of its
+            owning module followed by `.processor`.
+        """
+        # filled in place by the recursive walk below
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):  # duck-typed: any module exposing get_processor is recorded
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, it needs one entry per attention layer, keyed like `attn_processors`
+                ("<module path>.processor"). Entries are popped as they are assigned, so the dict is emptied.
+            _remove_lora (`bool`): Forwarded unchanged to every `set_processor` call.
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:  # the current processors belong to neither known family as a whole, so no default can be chosen
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+        # one shared processor instance is assigned to every attention layer
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    @apply_forward_hook
+    def encode(
+        self,
+        x: torch.FloatTensor,
+        sample_posterior: bool = True,
+        return_posterior: bool = False,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images/videos into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch; must be 5-D (frames on dim 2, height/width on the last two dims).
+            sample_posterior (`bool`): Sample from the posterior when True, otherwise take its mode.
+            return_posterior (`bool`): Also return the `DiagonalGaussianDistribution`.
+            generator (`torch.Generator`, *optional*): Forwarded to `posterior.sample`.
+
+        Returns:
+            The shifted and scaled latent `z`, or the tuple `(z, posterior)` when `return_posterior` is True.
+        """
+        assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
+        # Temporal tiling is checked first, then spatial tiling; batch slicing only applies to the un-tiled path.
+        if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
+            posterior = self.temporal_tiled_encode(x)
+        elif self.use_spatial_tiling and (
+            x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size
+        ):
+            posterior = self.spatial_tiled_encode(x)
+        else:
+            if self.use_slicing and x.shape[0] > 1:
+                encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+                h = torch.cat(encoded_slices)
+            else:
+                h = self.encoder(x)
+            moments = self.quant_conv(h)
+            posterior = DiagonalGaussianDistribution(moments)
+
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+
+        z = self.scale_factor * (z - self.shift_factor)  # shift & scale
+
+        if return_posterior:
+            return z, posterior
+        else:
+            return z
+
+    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
+        # Dispatch: temporal tiling first, then spatial tiling, otherwise decode the whole latent at once.
+        if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
+            return self.temporal_tiled_decode(z, return_dict=return_dict)
+
+        if self.use_spatial_tiling and (
+            z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size
+        ):
+            return self.spatial_tiled_decode(z, return_dict=return_dict)
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        # return_dict=False yields a 1-tuple `(dec,)`; otherwise the result is wrapped in DecoderOutput
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(self, z: torch.FloatTensor) -> torch.FloatTensor:
+        """
+        Decode a batch of latents back into images/videos.
+
+        Args:
+            z (`torch.FloatTensor`): Latents as returned by `encode` (already shifted and scaled).
+
+        Returns:
+            `torch.FloatTensor`: The decoded sample tensor (the `.sample` field of the internal
+            `DecoderOutput`), not a `DecoderOutput` or tuple.
+
+        With `use_slicing` and a batch larger than one, each batch element is decoded separately.
+        """
+        z = z / self.scale_factor + self.shift_factor  # scale & shift
+
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+        return decoded
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+        for row in range(blend_extent):
+            w = row / blend_extent  # weight of b; ramps from 0 towards 1 across the overlap
+            b[:, :, :, row, :] = a[:, :, :, -blend_extent + row, :] * (1 - w) + b[:, :, :, row, :] * w
+        # b was modified in place: its first rows are now cross-faded with the last rows of a (height axis)
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+        for col in range(blend_extent):
+            w = col / blend_extent  # weight of b; ramps from 0 towards 1 across the overlap
+            b[:, :, :, :, col] = a[:, :, :, :, -blend_extent + col] * (1 - w) + b[:, :, :, :, col] * w
+        # b was modified in place: its first columns are now cross-faded with the last columns of a (width axis)
+        return b
+
+    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
+        for frame in range(blend_extent):
+            w = frame / blend_extent  # weight of b; ramps from 0 towards 1 across the overlap
+            b[:, :, frame, :, :] = a[:, :, -blend_extent + frame, :, :] * (1 - w) + b[:, :, frame, :, :] * w
+        # b was modified in place: its first frames are now cross-faded with the last frames of a (dim -3)
+        return b
+
+    def spatial_tiled_encode(self, x: torch.FloatTensor, return_moments: bool = False) -> DiagonalGaussianDistribution:
+        r"""Encode a batch of images/videos tile by tile along height and width.
+
+        The input is cut into square tiles of `tile_sample_min_size` that overlap by `tile_overlap_factor`; every tile
+        goes through the same `encoder` and `quant_conv`. Neighbouring tiles are then cross-faded over `blend_extent`
+        latent positions (`blend_v` with the tile above, `blend_h` with the tile to the left), cropped to `row_limit`
+        and concatenated. The result may differ from un-tiled encoding since tiles are encoded independently;
+        the blending is there to hide the seams.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images/videos; tiling is applied to the last two dims.
+            return_moments (`bool`, *optional*, defaults to `False`):
+                Return the raw concatenated moments tensor instead of wrapping it in a distribution.
+
+        Returns:
+            `DiagonalGaussianDistribution` built from the blended moments, or the moments tensor itself when
+            `return_moments` is True.
+
+        """
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))  # stride between tile origins
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)  # overlap width in latent units
+        row_limit = self.tile_latent_min_size - blend_extent  # latent rows/cols kept from each tile
+
+        # Split video into tiles and encode them separately.
+        rows = []
+        for i in range(0, x.shape[-2], overlap_size):
+            row = []
+            for j in range(0, x.shape[-1], overlap_size):
+                tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        moments = torch.cat(result_rows, dim=-2)
+        if return_moments:
+            return moments
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+
+    def spatial_tiled_decode(
+        self, z: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of images/videos using a tiled decoder.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors; tiling is applied to the last two dims.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))  # stride between tile origins
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)  # overlap width in output units
+        row_limit = self.tile_sample_min_size - blend_extent  # output rows/cols kept from each decoded tile
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[-2], overlap_size):
+            row = []
+            for j in range(0, z.shape[-1], overlap_size):
+                tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        dec = torch.cat(result_rows, dim=-2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def temporal_tiled_encode(self, x: torch.FloatTensor) -> DiagonalGaussianDistribution:
+        B, C, T, H, W = x.shape  # only T is used below
+        overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))  # stride between tile starts
+        blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)  # overlap length in latent frames
+        t_limit = self.tile_latent_min_tsize - blend_extent  # latent frames kept from each tile
+        # Every tile takes one extra frame (`+ 1`); for all tiles but the first, the first latent frame is dropped.
+        # Split the video into tiles and encode them separately.
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size
+            ):
+                tile = self.spatial_tiled_encode(tile, return_moments=True)
+            else:
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+            if i > 0:
+                tile = tile[:, :, 1:, :, :]
+            row.append(tile)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)  # cross-fade with the tail of the previous tile
+                result_row.append(tile[:, :, :t_limit, :, :])
+            else:
+                result_row.append(tile[:, :, : t_limit + 1, :, :])  # the first tile keeps one more frame
+        moments = torch.cat(result_row, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+
+    def temporal_tiled_decode(
+        self, z: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        # Split z into overlapping tiles and decode them separately.
+        # Tiles are taken along dim 2; each may additionally be tiled spatially when spatial tiling is enabled.
+        B, C, T, H, W = z.shape  # only T is used below
+        overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))  # stride between tile starts
+        blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)  # overlap length in output frames
+        t_limit = self.tile_sample_min_tsize - blend_extent  # output frames kept from each decoded tile
+        # Every tile takes one extra latent frame (`+ 1`); all but the first drop their first decoded frame.
+        row = []
+        for i in range(0, T, overlap_size):
+            tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]
+            if self.use_spatial_tiling and (
+                tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size
+            ):
+                decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
+            else:
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+            if i > 0:
+                decoded = decoded[:, :, 1:, :, :]
+            row.append(decoded)
+        result_row = []
+        for i, tile in enumerate(row):
+            if i > 0:
+                tile = self.blend_t(row[i - 1], tile, blend_extent)  # cross-fade with the tail of the previous tile
+                result_row.append(tile[:, :, :t_limit, :, :])
+            else:
+                result_row.append(tile[:, :, : t_limit + 1, :, :])  # the first tile keeps one more frame
+
+        dec = torch.cat(result_row, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Tuple[torch.FloatTensor, DiagonalGaussianDistribution, torch.FloatTensor]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `True`):
+                Whether to sample from the posterior (otherwise its mode is used).
+            generator (`torch.Generator`, *optional*): Forwarded to `encode` for posterior sampling.
+        Returns: the tuple `(reconstruction, posterior, z)`.
+        """
+        x = sample
+        z, posterior = self.encode(x, return_posterior=True, sample_posterior=sample_posterior, generator=generator)
+        dec = self.decode(z)
+
+        return (dec, posterior, z)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+        # Refuse to fuse if any processor's class name contains "Added" (added-KV projections).
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+        # Remember the current processors so that unfuse_qkv_projections() can put them back.
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Restore the attention processors saved by `fuse_qkv_projections`, if any.
+
+        Does nothing when `fuse_qkv_projections` was never called (the attribute is absent then).
+
+        <Tip warning={true}>
+        This API is 🧪 experimental.
+        </Tip>
+        """
+        saved = getattr(self, "original_attn_processors", None)  # only created inside fuse_qkv_projections()
+        if saved is not None:
+            self.set_attn_processor(saved)
+
+    def get_last_layer(self):  # weight tensor of the decoder's output conv (decoder.conv_out.conv)
+        return self.decoder.conv_out.conv.weight
+
+    def get_latent_size(self, input_size: list[int]) -> list[int]:
+        # Map an input size (T, H, W) to the latent size: each axis n becomes (n - 1) // ratio + 1,
+        # with the temporal ratio for index 0 and the spatial ratio for indices 1 and 2.
+        ratios = (
+            self.time_compression_ratio,
+            *(self.spatial_compression_ratio for _ in range(2)),
+        )
+        return [(input_size[axis] - 1) // ratio + 1 for axis, ratio in enumerate(ratios)]
+
+
+@MODELS.register_module("hunyuan_vae")
+def CausalVAE3D_HUNYUAN(
+    from_pretrained: str | None = None,  # checkpoint handed to load_checkpoint; skipped when falsy
+    device_map: str | torch.device = "cuda",  # device the model is constructed on (and passed to load_checkpoint)
+    torch_dtype: torch.dtype = torch.bfloat16,  # dtype the freshly built model is cast to
+    **kwargs,  # remaining AutoEncoder3DConfig fields
+) -> AutoencoderKLCausal3D:
+    config = AutoEncoder3DConfig(from_pretrained=from_pretrained, **kwargs)
+    with torch.device(device_map):  # modules created inside this context are allocated on device_map
+        model = AutoencoderKLCausal3D(config).to(torch_dtype)
+    if from_pretrained:
+        model = load_checkpoint(model, from_pretrained, device_map=device_map, strict=True)
+    # without a checkpoint the model is returned with its freshly initialised weights
+    return model
diff --git a/opensora/models/hunyuan_vae/distributed.py b/opensora/models/hunyuan_vae/distributed.py
new file mode 100644
index 0000000..c68cb58
--- /dev/null
+++ b/opensora/models/hunyuan_vae/distributed.py
@@ -0,0 +1,580 @@
+from typing import List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
+from colossalai.shardformer.layer.attn import RingComm, _rescale_out_lse
+from colossalai.shardformer.layer.utils import SeqParallelUtils
+from diffusers.models.attention_processor import Attention
+
+from opensora.models.vae.tensor_parallel import Conv3dTPRow
+from opensora.models.vae.utils import get_conv3d_n_chunks
+
+from .unet_causal_3d_blocks import UpsampleCausal3D
+
+try:
+    from xformers.ops.fmha import (
+        Context,
+        Inputs,
+        _memory_efficient_attention_backward,
+        _memory_efficient_attention_forward_requires_grad,
+    )
+
+    HAS_XFORMERS = True
+except ImportError:
+    HAS_XFORMERS = False
+
+# Default `seq_align` / `seq_limit` arguments of the chunked attention helpers in this
+# module; both are forwarded to `get_conv3d_n_chunks` to pick the number of sequence chunks.
+SEQ_ALIGN = 32
+SEQ_LIMIT = 16 * 1024
+
+
+def align_atten_bias(attn_bias):
+    """Return `attn_bias` backed by storage whose last dimension is padded to a multiple of 8.
+
+    The returned tensor has the same shape and values as the input. When the key length
+    is not already a multiple of 8, the data is copied into a wider buffer and a view of
+    the first `S_kv` columns is returned, so only the row stride changes.
+    Presumably this satisfies the bias alignment requirement of the xformers kernels —
+    TODO confirm against the xformers version in use.
+
+    Args:
+        attn_bias: Tensor of shape [B, N, S_q, S_kv], or `None`.
+
+    Returns:
+        `None` if `attn_bias` is `None`; the input itself if already aligned; otherwise
+        an aligned copy with identical shape and content.
+    """
+    # Callers default to `attn_bias=None` (no mask); pass that through unchanged.
+    if attn_bias is None:
+        return None
+    # Keep query and key lengths apart so non-square biases get a correctly shaped buffer.
+    B, N, S_q, S_kv = attn_bias.shape
+    align_size = 8
+    if S_kv % align_size != 0:
+        expand_S = (S_kv // align_size + 1) * align_size
+        padded = torch.empty((B, N, S_q, expand_S), dtype=attn_bias.dtype, device=attn_bias.device)
+        attn_bias = padded[:, :, :, :S_kv].copy_(attn_bias)
+    return attn_bias
+
+
+def _attn_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    attn_bias: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+):
+    """Run one xformers memory-efficient attention forward pass and return its LSE.
+
+    Args:
+        q, k, v: Attention inputs; the callers in this module pass [B, S, N, D] tensors.
+        attn_bias: Optional additive bias of shape [B, N, S_q, S_kv]; `None` means no bias.
+        scale: Optional softmax scale forwarded to xformers.
+
+    Returns:
+        Tuple `(out, lse, rng_state)` taken from the xformers forward context. The last
+        dimension of `lse` is trimmed to the query length if it comes back longer.
+    """
+    # Only align when a bias is supplied; `None` is a valid "no mask" input.
+    if attn_bias is not None:
+        attn_bias = align_atten_bias(attn_bias)
+    inp = Inputs(q, k, v, attn_bias, p=0, scale=scale, is_partial=False)
+    out, ctx = _memory_efficient_attention_forward_requires_grad(inp, None)
+
+    # Query length: taken from the bias when present, otherwise from q ([B, S, N, D]).
+    S = attn_bias.shape[-2] if attn_bias is not None else q.shape[1]
+    if ctx.lse.shape[-1] != S:
+        ctx.lse = ctx.lse[:, :, :S]
+    return out, ctx.lse, ctx.rng_state
+
+
+def _attn_bwd(
+    grad: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    lse: torch.Tensor,
+    rng_state: torch.Tensor,
+    attn_bias: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+):
+    """Run one xformers memory-efficient attention backward pass.
+
+    Args:
+        grad: Gradient with respect to the attention output.
+        q, k, v: The inputs used in the matching forward call.
+        out: Attention output used to rebuild the xformers context.
+        lse: Log-sum-exp used to rebuild the xformers context.
+        rng_state: RNG state stored by the forward pass (may be `None`).
+        attn_bias: Optional additive bias of shape [B, N, S_q, S_kv]; `None` means no bias.
+        scale: Optional softmax scale forwarded to xformers.
+
+    Returns:
+        Tuple `(dq, dk, dv)`.
+    """
+    # Only align when a bias is supplied; `None` is a valid "no mask" input.
+    if attn_bias is not None:
+        attn_bias = align_atten_bias(attn_bias)
+    inp = Inputs(q, k, v, attn_bias, p=0, scale=scale, output_dtype=q.dtype, is_partial=False)
+    ctx = Context(lse, out, rng_state=rng_state)
+    grads = _memory_efficient_attention_backward(ctx, inp, grad, None)
+    return grads.dq, grads.dk, grads.dv
+
+
+class MemEfficientRingAttention(torch.autograd.Function):
+    """Ring attention over a sequence-parallel group, written as a custom autograd function.
+
+    Every rank keeps its local query block and attends to each key/value block in turn.
+    The next block is exchanged through `RingComm` and partial results are merged with
+    `_rescale_out_lse`. Call the `attention` entry point rather than `apply` directly:
+    it creates the class-level CUDA event and stream that `forward` relies on.
+    """
+
+    # CUDA event recorded after each block's attention call; created lazily in `attention`.
+    ATTN_DONE: torch.cuda.Event = None
+    # Secondary CUDA stream that `forward` alternates with the current stream; created lazily in `attention`.
+    SP_STREAM: torch.cuda.Stream = None
+
+    @staticmethod
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        sp_group: dist.ProcessGroup,
+        sp_stream: torch.cuda.Stream,
+        softmax_scale: Optional[float] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Ring attention forward
+
+        Args:
+            ctx (_type_): autograd context used to stash tensors and settings for backward
+            q (torch.Tensor): shape [B, S/P, N, D]
+            k (torch.Tensor): shape [B, S/P, N, D]
+            v (torch.Tensor): shape [B, S/P, N, D]
+            sp_group (dist.ProcessGroup): sequence parallel group
+            sp_stream (torch.cuda.Stream): sequence parallel stream
+            softmax_scale (Optional[float], optional): softmax scale. Defaults to None,
+                in which case `D ** -0.5` is used.
+            attn_mask (Optional[torch.Tensor], optional): attention mask shape [B, N, S/P, S]. Defaults to None.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: output and log sum exp. Output's shape should be [B, S/P, N, D]. LSE's shape should be [B, N, S/P].
+        """
+        if softmax_scale is None:
+            softmax_scale = q.shape[-1] ** (-0.5)
+        sp_size = dist.get_world_size(sp_group)
+        sp_rank = dist.get_rank(sp_group)
+        # Two communicators, used alternately so consecutive ring steps do not share one.
+        kv_comms: List[RingComm] = [RingComm(sp_group) for _ in range(2)]
+        block_attn_masks = [None] * sp_size
+        if attn_mask is not None:
+            # if attn_mask is split, uncomment the following line
+            # attn_mask = attn_mask.chunk(sp_size, dim=2)[sp_rank]
+            # One mask slice per key/value block, cut along the key dimension.
+            block_attn_masks = attn_mask.chunk(sp_size, dim=-1)
+
+        # [B, S, N, D]
+        q, k, v = [x.contiguous() for x in [q, k, v]]
+        # Pre-allocate double buffer for overlapping and receiving next step's inputs
+        kv_buffers = [torch.stack((k, v))]  # (2, B, S, N, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+        # outputs
+        out = None
+        block_out = [None, None]
+        softmax_lse = [None, None]
+        block_softmax_lse = [None, None]  # log sum exp, the denominator of softmax in attention
+        rng_states = [None for _ in range(sp_size)]
+        # Even steps run on the current stream, odd steps on `sp_stream`.
+        sp_streams = [torch.cuda.current_stream(), sp_stream]
+
+        def _kv_comm(i):
+            # Avoid overwriting attn input when it shares mem with buffer
+            if not MemEfficientRingAttention.ATTN_DONE.query():
+                kv_buffers[(i + 1) % 2] = torch.empty_like(kv_buffers[i % 2])
+            # The last step has nothing left to exchange.
+            if i < sp_size - 1:
+                kv_comms[i % 2].send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+        # Index of the key/value block currently held; starts at this rank's own block.
+        block_idx = sp_rank
+        for i in range(sp_size):
+            with torch.cuda.stream(sp_streams[i % 2]):
+                # Wait for current kv from prev rank
+                # NOTE: waiting outside the current stream will NOT correctly synchronize.
+                if i == 0:
+                    _kv_comm(i)
+                else:
+                    kv_comms[(i + 1) % 2].wait()
+                kv_block = kv_buffers[i % 2]
+                q_block = q
+                block_out[i % 2], block_softmax_lse[i % 2], rng_states[i] = _attn_fwd(
+                    q_block, kv_block[0], kv_block[1], attn_bias=block_attn_masks[block_idx], scale=softmax_scale
+                )
+                MemEfficientRingAttention.ATTN_DONE.record()
+                # Pipeline the next KV comm with output correction instead of the next flash attn
+                # to minimize idle time when comm takes longer than attn.
+                _kv_comm(i + 1)
+                block_softmax_lse[i % 2] = (
+                    block_softmax_lse[i % 2].transpose(1, 2).unsqueeze(-1).contiguous().float()
+                )  # [B, N, S] -> [B, S, N, 1]
+                assert (
+                    block_out[i % 2].shape[:-1] == block_softmax_lse[i % 2].shape[:-1]
+                ), f"{block_out[i % 2].shape} != {block_softmax_lse[i % 2].shape}"
+                # Output and log sum exp correction. Ideally overlap this with the next flash attn kernel.
+                # In reality this always finishes before next flash attn; no need for extra sync.
+                if i == 0:
+                    out = block_out[0]
+                    softmax_lse = block_softmax_lse[0]
+                else:
+                    out, softmax_lse = _rescale_out_lse(out, block_out[i % 2], softmax_lse, block_softmax_lse[i % 2])
+                # The next block received comes from the preceding position in the ring.
+                block_idx = (block_idx - 1) % sp_size
+        # Make the caller's stream wait for work queued on the secondary stream.
+        torch.cuda.current_stream().wait_stream(sp_stream)
+        out = out.to(q.dtype)
+        # [B, S, N, 1] -> [B, N, S]
+        softmax_lse = softmax_lse.squeeze(-1).transpose(1, 2).contiguous()
+
+        ctx.softmax_scale = softmax_scale
+        ctx.block_attn_masks = block_attn_masks
+        ctx.sp_group = sp_group
+        ctx.save_for_backward(q, k, v, out, softmax_lse, *rng_states)  # lse [B, N, S]
+        return out, softmax_lse
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_softmax_lse):
+        """Ring attention backward.
+
+        Replays the key/value ring, computes per-block gradients with
+        `_context_chunk_attn_bwd`, accumulates dQ locally in float32 and passes the
+        dK/dV accumulator around the ring through a second `RingComm`.
+        `grad_softmax_lse` is not used.
+
+        Returns:
+            `(dq, dk, dv)` cast to the dtype of `q`, followed by `None` placeholders.
+        """
+        # q, k, v, out: [B, S, N, D], softmax_lse: [B, N, S]
+        q, k, v, out, softmax_lse, *rng_states = ctx.saved_tensors
+
+        sp_group = ctx.sp_group
+        sp_size = dist.get_world_size(sp_group)
+        kv_comm = RingComm(sp_group)
+        dkv_comm = RingComm(sp_group)
+
+        grad_output = grad_output.contiguous()
+        kv_buffers = [torch.stack((k, v))]  # (2, B, S, N, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+        dq = None
+        # dK/dV are accumulated in float32 regardless of the input dtype.
+        dkv_buffers = [torch.empty_like(kv, dtype=torch.float) for kv in kv_buffers]
+        del k, v
+
+        block_idx = dist.get_rank(sp_group)
+        for i in range(sp_size):
+            if i > 0:
+                kv_comm.wait()
+            if i < sp_size - 1:
+                kv_comm.send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+            k_block, v_block = kv_buffers[i % 2]
+            # The final `out` and `softmax_lse` from forward are passed for every block;
+            # `rng_states[i]` is the state saved for ring step i in forward.
+            dq_block, dk_block, dv_block = _context_chunk_attn_bwd(
+                grad_output,
+                q,
+                k_block,
+                v_block,
+                out,
+                softmax_lse,
+                rng_states[i],
+                attn_bias=ctx.block_attn_masks[block_idx],
+                scale=ctx.softmax_scale,
+            )
+
+            if i == 0:
+                dq = dq_block.float()
+                dkv_buffers[i % 2][0] = dk_block.float()
+                dkv_buffers[i % 2][1] = dv_block.float()
+            else:
+                dq += dq_block
+                # Wait for the dK/dV accumulator sent in the previous step before adding to it.
+                dkv_comm.wait()
+                dkv_buffers[i % 2][0] += dk_block
+                dkv_buffers[i % 2][1] += dv_block
+            dkv_comm.send_recv(dkv_buffers[i % 2], dkv_buffers[(i + 1) % 2])
+            block_idx = (block_idx - 1) % sp_size
+        dkv_comm.wait()
+        # Buffer that received the last exchange.
+        dkv = dkv_buffers[sp_size % 2]
+
+        dq, dk, dv = [x.to(q.dtype) for x in (dq, *dkv)]
+
+        torch.cuda.empty_cache()
+        # NOTE(review): `forward` takes 7 inputs after `ctx` but 16 values are returned here;
+        # the extras are all None — presumably tolerated by autograd, confirm and trim.
+        return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None
+
+    @staticmethod
+    def attention(
+        q,
+        k,
+        v,
+        sp_group,
+        softmax_scale: Optional[float] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        return_softmax: bool = False,
+    ):
+        """Ring attention
+
+        Creates the shared CUDA event and secondary stream on first use, then runs the
+        autograd function.
+
+        Args:
+            q (torch.Tensor): shape [B, S, N, D]
+            k (torch.Tensor): shape [B, S, N, D]
+            v (torch.Tensor): shape [B, S, N, D]
+            sp_group (dist.ProcessGroup): sequence parallel group
+            softmax_scale (Optional[float], optional): softmax scale. Defaults to None.
+            attn_mask (Optional[torch.Tensor], optional): attention mask. Defaults to None.
+            return_softmax (bool, optional): also return the log sum exp. Defaults to False.
+
+        Returns:
+            The output of shape [B, S, N, D]; when `return_softmax` is True, a tuple of
+            the output and the log sum exp of shape [B, N, S].
+        """
+        if MemEfficientRingAttention.ATTN_DONE is None:
+            MemEfficientRingAttention.ATTN_DONE = torch.cuda.Event()
+        if MemEfficientRingAttention.SP_STREAM is None:
+            MemEfficientRingAttention.SP_STREAM = torch.cuda.Stream()
+        out, softmax_lse = MemEfficientRingAttention.apply(
+            q, k, v, sp_group, MemEfficientRingAttention.SP_STREAM, softmax_scale, attn_mask
+        )
+        if return_softmax:
+            return out, softmax_lse
+        return out
+
+
+class MemEfficientRingAttnProcessor:
+    """Attention processor that runs a diffusers `Attention` module with ring attention.
+
+    The hidden states are split along the sequence dimension across `sp_group`,
+    projected to query/key/value locally, attended with
+    `MemEfficientRingAttention.attention`, and gathered back after the output projection.
+    """
+
+    def __init__(self, sp_group: dist.ProcessGroup):
+        """Store the sequence-parallel group; raises `ImportError` when xformers is unavailable."""
+        self.sp_group = sp_group
+        if not HAS_XFORMERS:
+            raise ImportError("MemEfficientRingAttnProcessor requires xformers, to use it, please install xformers.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Apply `attn` to `hidden_states` using ring attention.
+
+        Args:
+            attn: The attention module whose projections and norms are used.
+            hidden_states: Input of shape [B, S, C] or [B, C, H, W].
+            encoder_hidden_states: Optional key/value source; defaults to the
+                (sequence-split) hidden states.
+            attention_mask: Optional mask, prepared with `attn.prepare_attention_mask`.
+            temb: Optional embedding passed to `attn.spatial_norm` when that is set.
+
+        Returns:
+            Tensor with the same layout as the input `hidden_states`.
+        """
+        sp_group = self.sp_group
+        assert sp_group is not None, "sp_group must be provided for MemEfficientRingAttnProcessor"
+
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            # Flatten spatial dims: [B, C, H, W] -> [B, H*W, C]
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        # From here on each rank only holds its slice of the sequence (dim 1).
+        hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group)
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        # [B, S_local, N * D] -> [B, S_local, N, D]
+        query = query.view(batch_size, -1, attn.heads, head_dim)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim)
+        value = value.view(batch_size, -1, attn.heads, head_dim)
+
+        # NOTE(review): `query` is already the local slice at this point, so this checks the
+        # per-rank length rather than the full sequence length — confirm that is intended.
+        assert (
+            query.shape[1] % dist.get_world_size(sp_group) == 0
+        ), f"sequence length ({query.shape[1]}) must be divisible by sp_group size ({dist.get_world_size(sp_group)})"
+
+        hidden_states = MemEfficientRingAttention.attention(query, key, value, sp_group, attn_mask=attention_mask)
+
+        hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        # Reassemble the full sequence from all ranks.
+        hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class ContextParallelAttention:
+    """Replacement target that switches a diffusers `Attention` module to ring attention.
+
+    The class is never instantiated; `from_native_module` modifies the given module in
+    place and returns it.
+    """
+
+    def __init__(self):
+        raise ImportError("ContextParallelAttention should not be initialized directly.")
+
+    @staticmethod
+    def from_native_module(module: Attention, process_group, *args, **kwargs) -> Attention:
+        """Configure `module` for sequence-parallel ring attention.
+
+        Marks the query/key/value projection parameters as partially derived under
+        sequence parallelism and installs a `MemEfficientRingAttnProcessor` bound to
+        `process_group`.
+
+        Args:
+            module (Attention): The attention module to convert (modified in place).
+            process_group: Process group used for sequence parallelism.
+            *args, **kwargs: Accepted for interface compatibility and ignored.
+
+        Returns:
+            Attention: The same `module` instance.
+        """
+
+        # Since gradients are computed using only a subset of the data,
+        # aggregation of these gradients is necessary during backpropagation.
+        # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation.
+        # NOTE(review): the processor also applies `to_out` to the sequence-split hidden states,
+        # so its parameters look partially derived as well — confirm whether they need marking.
+        for projection in (module.to_q, module.to_k, module.to_v):
+            SeqParallelUtils.marked_as_sp_partial_derived_param(projection.weight)
+            # Check each bias on its own instead of inferring k/v from the query projection.
+            if projection.bias is not None:
+                SeqParallelUtils.marked_as_sp_partial_derived_param(projection.bias)
+
+        module.set_processor(MemEfficientRingAttnProcessor(process_group))
+
+        return module
+
+
+def _context_chunk_attn_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    attn_bias: Optional[torch.Tensor],
+    scale: Optional[float],
+    seq_align: int = SEQ_ALIGN,
+    seq_limit: int = SEQ_LIMIT,
+):
+    """Attention forward computed block-wise over sequence chunks.
+
+    `q`, `k` and `v` are chunked along dim 1 and every (query chunk, key/value chunk)
+    pair is run through `_attn_fwd`; partial results for one query chunk are merged
+    with `_rescale_out_lse`.
+
+    Args:
+        q, k, v: Tensors chunked along dim 1 (callers in this module use [B, S, N, D]).
+        attn_bias: Optional bias, chunked along dim 2 (queries) and dim 3 (keys).
+        scale: Optional softmax scale.
+        seq_align, seq_limit: Forwarded to `get_conv3d_n_chunks` to choose the chunk count.
+
+    Returns:
+        Tuple `(out, lse, rng_states)`: outputs concatenated along dim 1, LSE
+        concatenated along the last dim in [B, N, S] layout, and the list of RNG states
+        in (query chunk, key/value chunk) order.
+    """
+    seq_len = q.shape[1]
+    n_chunks = get_conv3d_n_chunks(seq_len, seq_align, seq_limit)
+    q_chunks, k_chunks, v_chunks = q.chunk(n_chunks, dim=1), k.chunk(n_chunks, dim=1), v.chunk(n_chunks, dim=1)
+    attn_bias_chunks = attn_bias.chunk(n_chunks, dim=2) if attn_bias is not None else [None] * n_chunks
+    out_chunks = []
+    lse_chunks = []
+    rng_states = []
+    for q_chunk, attn_bias_chunk in zip(q_chunks, attn_bias_chunks):
+        inner_attn_bias_chunks = (
+            attn_bias_chunk.chunk(n_chunks, dim=3) if attn_bias_chunk is not None else [None] * n_chunks
+        )
+        out_chunk = None
+        lse_chunk = None
+        for k_chunk, v_chunk, inner_attn_bias_chunk in zip(k_chunks, v_chunks, inner_attn_bias_chunks):
+            block_out, block_lse, rng_state = _attn_fwd(q_chunk, k_chunk, v_chunk, inner_attn_bias_chunk, scale)
+            block_lse = block_lse.transpose(1, 2).unsqueeze(-1).contiguous().float()  # [B, N, S] -> [B, S, N, 1]
+            rng_states.append(rng_state)
+            if out_chunk is None:
+                out_chunk = block_out
+                lse_chunk = block_lse
+            else:
+                out_chunk, lse_chunk = _rescale_out_lse(out_chunk, block_out, lse_chunk, block_lse)
+        # Convert back only after all key/value chunks are merged: the running LSE must
+        # stay in [B, S, N, 1] layout to match `block_lse` in `_rescale_out_lse`.
+        lse_chunk = lse_chunk.squeeze(-1).transpose(1, 2).contiguous()  # [B, S, N, 1] -> [B, N, S]
+        out_chunks.append(out_chunk)
+        lse_chunks.append(lse_chunk)
+    out = torch.cat(out_chunks, dim=1)
+    lse = torch.cat(lse_chunks, dim=-1)
+    return out, lse, rng_states
+
+
+def _context_chunk_attn_bwd(
+    grad: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    lse: torch.Tensor,
+    rng_states: torch.Tensor,
+    attn_bias: Optional[torch.Tensor] = None,
+    scale: Optional[float] = None,
+    seq_align: int = SEQ_ALIGN,
+    seq_limit: int = SEQ_LIMIT,
+    fast_accum: bool = False,
+):
+    """Attention backward computed block-wise over sequence chunks.
+
+    With a single chunk this is a direct `_attn_bwd` call. Otherwise every
+    (query chunk, key/value chunk) pair is run through `_attn_bwd` and the block
+    gradients are accumulated into full-size dq/dk/dv buffers.
+
+    Args:
+        grad: Gradient with respect to the attention output, chunked along dim 1.
+        q, k, v: Forward inputs, chunked along dim 1.
+        out: Forward output, chunked along dim 1.
+        lse: Forward log-sum-exp, chunked along the last dim.
+        rng_states: Passed as-is to `_attn_bwd` for a single chunk; otherwise indexed
+            per block in (query chunk, key/value chunk) order. `None` means no state.
+        attn_bias: Optional bias, chunked along dim 2 (queries) and dim 3 (keys).
+        scale: Optional softmax scale.
+        seq_align, seq_limit: Forwarded to `get_conv3d_n_chunks` to choose the chunk count.
+        fast_accum: Accumulate in the dtype of `q` instead of float32.
+
+    Returns:
+        Tuple `(dq, dk, dv)` cast to the dtypes of `q`, `k` and `v`.
+    """
+    seq_len = q.shape[1]
+    n_chunks = get_conv3d_n_chunks(seq_len, seq_align, seq_limit)
+    if n_chunks == 1:
+        return _attn_bwd(grad, q, k, v, out, lse, rng_states, attn_bias, scale)
+
+    q_chunks, k_chunks, v_chunks = q.chunk(n_chunks, dim=1), k.chunk(n_chunks, dim=1), v.chunk(n_chunks, dim=1)
+    # `torch.chunk` may return fewer chunks than requested, so iterate over what was
+    # actually produced instead of assuming exactly `n_chunks` pieces.
+    n_q_chunks = len(q_chunks)
+    n_kv_chunks = len(k_chunks)
+    attn_bias_chunks = attn_bias.chunk(n_chunks, dim=2) if attn_bias is not None else [None] * n_q_chunks
+    out_chunks = out.chunk(n_chunks, dim=1)
+    dout_chunks = grad.chunk(n_chunks, dim=1)
+    lse_chunks = lse.chunk(n_chunks, dim=-1)
+    if rng_states is None:
+        rng_states = [None] * (n_q_chunks * n_kv_chunks)
+
+    acc_dtype = q.dtype if fast_accum else torch.float
+
+    dq = torch.zeros_like(q, dtype=acc_dtype)
+    dk = torch.zeros_like(k, dtype=acc_dtype)
+    dv = torch.zeros_like(v, dtype=acc_dtype)
+
+    # Chunks are views, so the in-place additions below write into dq/dk/dv.
+    dq_chunks = dq.chunk(n_chunks, dim=1)
+    dk_chunks = dk.chunk(n_chunks, dim=1)
+    dv_chunks = dv.chunk(n_chunks, dim=1)
+
+    # Running block counter; matches the order in which the chunked forward appends RNG states.
+    block_counter = 0
+    for q_idx in range(n_q_chunks):
+        q_chunk = q_chunks[q_idx]
+        attn_bias_chunk = attn_bias_chunks[q_idx]
+        inner_attn_bias_chunks = (
+            attn_bias_chunk.chunk(n_chunks, dim=3) if attn_bias_chunk is not None else [None] * n_kv_chunks
+        )
+        out_chunk = out_chunks[q_idx]
+        dout_chunk = dout_chunks[q_idx]
+        lse_chunk = lse_chunks[q_idx]
+        dq_acc = dq_chunks[q_idx]
+
+        for kv_idx in range(n_kv_chunks):
+            block_dq, block_dk, block_dv = _attn_bwd(
+                dout_chunk,
+                q_chunk,
+                k_chunks[kv_idx],
+                v_chunks[kv_idx],
+                out_chunk,
+                lse_chunk,
+                rng_states[block_counter],
+                inner_attn_bias_chunks[kv_idx],
+                scale,
+            )
+
+            dq_acc += block_dq
+            dk_chunks[kv_idx] += block_dk
+            dv_chunks[kv_idx] += block_dv
+            block_counter += 1
+
+    return dq.to(q.dtype), dk.to(k.dtype), dv.to(v.dtype)
+
+
+def prepare_parallel_causal_attention_mask(
+    parallel_rank: int, parallel_size: int, n_frame: int, n_hw: int, dtype, device, batch_size: int = None
+):
+    """Build this rank's rows of a frame-causal additive attention mask.
+
+    The full sequence has `n_frame * n_hw` tokens and is split evenly across
+    `parallel_size` ranks. A token in frame `f` may attend to every token of frames
+    `0..f` (mask value 0); all later positions hold a large negative value.
+
+    Args:
+        parallel_rank: Index of this rank within the parallel group.
+        parallel_size: Number of ranks the sequence is split across.
+        n_frame: Number of frames.
+        n_hw: Number of tokens per frame.
+        dtype: Dtype of the returned mask.
+        device: Device of the returned mask.
+        batch_size: When given, the mask is expanded to a leading batch dimension.
+
+    Returns:
+        Tensor of shape [S/P, S], or [batch_size, S/P, S] when `batch_size` is given.
+    """
+    seq_len = n_frame * n_hw
+    assert seq_len % parallel_size == 0, f"seq_len {seq_len} must be divisible by parallel_size {parallel_size}"
+    local_seq_len = seq_len // parallel_size
+    local_seq_start = local_seq_len * parallel_rank
+    if dtype is torch.bfloat16:
+        # A trick to avoid nan of memory efficient attention, maybe introduce some bias
+        fmin = torch.finfo(torch.float16).min
+    else:
+        fmin = torch.finfo(dtype).min
+    mask = torch.full((local_seq_len, seq_len), fmin, dtype=dtype, device=device)
+    if local_seq_len > 0:
+        # Vectorised fill: one comparison instead of one slice write per row.
+        rows = torch.arange(local_seq_start, local_seq_start + local_seq_len, device=device)
+        # Exclusive end of the visible range for each row: the end of that row's own frame.
+        visible_end = (rows // n_hw + 1) * n_hw
+        cols = torch.arange(seq_len, device=device)
+        mask.masked_fill_(cols.unsqueeze(0) < visible_end.unsqueeze(1), 0)
+    if batch_size is not None:
+        mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+    return mask
+
+
+def prepare_parallel_attention_mask(
+    self, hidden_states: torch.Tensor, cp_group: dist.ProcessGroup = None
+) -> torch.Tensor:
+    """Build this rank's frame-causal attention mask for a [B, C, T, H, W] input.
+
+    Rank and world size are read from `cp_group`; dtype and device follow `hidden_states`.
+    """
+    batch, _, n_frames, height, width = hidden_states.shape
+    rank = dist.get_rank(cp_group)
+    world_size = dist.get_world_size(cp_group)
+    return prepare_parallel_causal_attention_mask(
+        rank,
+        world_size,
+        n_frames,
+        height * width,
+        hidden_states.dtype,
+        hidden_states.device,
+        batch_size=batch,
+    )
+
+
+class TPUpDecoderBlockCausal3D(UpsampleCausal3D):
+    """`UpsampleCausal3D` whose convolution is replaced by a tensor-parallel `Conv3dTPRow`.
+
+    The input is split along the channel dimension across `tp_group` before the
+    parent's forward pass runs.
+    """
+
+    def __init__(
+        self,
+        channels,
+        out_channels=None,
+        kernel_size=3,
+        bias=True,
+        upsample_factor=(2, 2, 2),
+        tp_group=None,
+        split_input: bool = False,
+        split_output: bool = False,
+        conv_=None,
+        shortcut_=None,
+    ):
+        """Build the block and swap its convolution for a tensor-parallel one.
+
+        Args:
+            channels, out_channels, kernel_size, bias, upsample_factor: Forwarded to
+                `UpsampleCausal3D.__init__`.
+            tp_group: Tensor-parallel process group (required).
+            split_input, split_output: Forwarded to `Conv3dTPRow.from_native_module`.
+            conv_: Existing convolution to convert; defaults to the freshly built one.
+            shortcut_: Accepted for interface compatibility; currently unused.
+        """
+        assert tp_group is not None, "tp_group must be provided"
+        super().__init__(channels, out_channels, kernel_size, bias, upsample_factor)
+        conv = conv_ if conv_ is not None else self.conv.conv
+        self.conv.conv = Conv3dTPRow.from_native_module(
+            conv, tp_group, split_input=split_input, split_output=split_output
+        )
+        self.tp_group = tp_group
+        tp_size = dist.get_world_size(group=self.tp_group)
+        assert self.channels % tp_size == 0, f"channels {self.channels} must be divisible by tp_size {tp_size}"
+        # Each rank only sees its share of the input channels.
+        self.channels = self.channels // tp_size
+
+    def forward(self, input_tensor):
+        """Split `input_tensor` along dim 1 across the group, then run the parent forward."""
+        input_tensor = split_forward_gather_backward(input_tensor, 1, self.tp_group)
+        return super().forward(input_tensor)
+
+    # Declared static so it behaves the same when called on the class or on an instance;
+    # calls on the class (`TPUpDecoderBlockCausal3D.from_native_module(...)`) are unaffected.
+    @staticmethod
+    def from_native_module(module: UpsampleCausal3D, process_group, **kwargs):
+        """Create a tensor-parallel block from an existing `UpsampleCausal3D`.
+
+        Args:
+            module: The native block; its convolution is reused.
+            process_group: Tensor-parallel process group.
+            **kwargs: Extra constructor arguments such as `split_input` / `split_output`.
+
+        Returns:
+            A new `TPUpDecoderBlockCausal3D`.
+        """
+        conv = module.conv.conv
+        return TPUpDecoderBlockCausal3D(
+            module.channels,
+            module.out_channels,
+            conv.kernel_size[0],
+            conv.bias is not None,
+            module.upsample_factor,
+            conv_=conv,
+            shortcut_=getattr(module, "shortcut", None),
+            tp_group=process_group,
+            **kwargs,
+        )
diff --git a/opensora/models/hunyuan_vae/policy.py b/opensora/models/hunyuan_vae/policy.py
new file mode 100644
index 0000000..bfaf8e4
--- /dev/null
+++ b/opensora/models/hunyuan_vae/policy.py
@@ -0,0 +1,155 @@
+from functools import partial
+from typing import Dict, Union
+
+import torch.nn as nn
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+from opensora.models.vae.tensor_parallel import Conv3dTPCol, Conv3dTPRow, GroupNormTP
+
+from .distributed import ContextParallelAttention, TPUpDecoderBlockCausal3D, prepare_parallel_attention_mask
+from .vae import DecoderCausal3D, EncoderCausal3D
+
+
+def gen_resnets_replacements(prefix: str, with_shortcut: bool = False):
+    """List the tensor-parallel sub-module replacements for one resnet block.
+
+    For the block at `prefix`, the two norms become `GroupNormTP` and the two
+    convolutions become `Conv3dTPRow` with `split_output=True`, in the order
+    norm1, conv1, norm2, conv2. With `with_shortcut`, the shortcut convolution is
+    appended and replaced the same way.
+    """
+
+    def _norm(name: str):
+        return SubModuleReplacementDescription(
+            suffix=f"{prefix}.{name}",
+            target_module=GroupNormTP,
+        )
+
+    def _row_conv(name: str):
+        # A fresh kwargs dict per description, as in a literal listing.
+        return SubModuleReplacementDescription(
+            suffix=f"{prefix}.{name}.conv",
+            target_module=Conv3dTPRow,
+            kwargs=dict(split_output=True),
+        )
+
+    replacements = []
+    for stage in (1, 2):
+        replacements.append(_norm(f"norm{stage}"))
+        replacements.append(_row_conv(f"conv{stage}"))
+    if with_shortcut:
+        replacements.append(_row_conv("conv_shortcut"))
+    return replacements
+
+
+class HunyuanVaePolicy(Policy):
+    """Shardformer policy describing how the Hunyuan VAE encoder and decoder are parallelised."""
+
+    def config_sanity_check(self):
+        """No configuration checks are performed."""
+
+    def preprocess(self):
+        """Return the model unchanged."""
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        """Return the replacement descriptions for `EncoderCausal3D` and `DecoderCausal3D`."""
+
+        def _replace(suffix, target, **target_kwargs):
+            # Only pass `kwargs` when there is something to forward.
+            if target_kwargs:
+                return SubModuleReplacementDescription(suffix=suffix, target_module=target, kwargs=target_kwargs)
+            return SubModuleReplacementDescription(suffix=suffix, target_module=target)
+
+        def _mask_method():
+            return partial(prepare_parallel_attention_mask, cp_group=self.shard_config.tensor_parallel_process_group)
+
+        encoder_subs = [_replace("conv_in.conv", Conv3dTPCol)]
+        encoder_subs += gen_resnets_replacements("down_blocks[0].resnets[0]")
+        encoder_subs += gen_resnets_replacements("down_blocks[0].resnets[1]")
+        encoder_subs.append(_replace("down_blocks[0].downsamplers[0].conv.conv", Conv3dTPRow, split_output=True))
+        encoder_subs += gen_resnets_replacements("down_blocks[1].resnets[0]", with_shortcut=True)
+        encoder_subs += gen_resnets_replacements("down_blocks[1].resnets[1]")
+        encoder_subs.append(_replace("down_blocks[1].downsamplers[0].conv.conv", Conv3dTPRow))
+        encoder_subs.append(_replace("mid_block.attentions[0]", ContextParallelAttention))
+
+        encoder_blocks = self.model.encoder.down_blocks
+        encoder_attrs = {
+            "down_blocks[0].downsamplers[0].channels": encoder_blocks[0].downsamplers[0].channels
+            // self.shard_config.tensor_parallel_size,
+            "down_blocks[1].downsamplers[0].channels": encoder_blocks[1].downsamplers[0].channels
+            // self.shard_config.tensor_parallel_size,
+        }
+
+        decoder_subs = [_replace("up_blocks[1].upsamplers[0]", TPUpDecoderBlockCausal3D, split_output=True)]
+        decoder_subs += gen_resnets_replacements("up_blocks[2].resnets[0]", with_shortcut=True)
+        decoder_subs += gen_resnets_replacements("up_blocks[2].resnets[1]")
+        decoder_subs += gen_resnets_replacements("up_blocks[2].resnets[2]")
+        decoder_subs.append(_replace("up_blocks[2].upsamplers[0].conv.conv", Conv3dTPRow, split_output=True))
+        decoder_subs += gen_resnets_replacements("up_blocks[3].resnets[0]", with_shortcut=True)
+        decoder_subs += gen_resnets_replacements("up_blocks[3].resnets[1]")
+        decoder_subs += gen_resnets_replacements("up_blocks[3].resnets[2]")
+        decoder_subs.append(_replace("conv_norm_out", GroupNormTP))
+        decoder_subs.append(_replace("conv_out.conv", Conv3dTPRow))
+        decoder_subs.append(_replace("mid_block.attentions[0]", ContextParallelAttention))
+
+        decoder_attrs = {
+            "up_blocks[2].upsamplers[0].channels": self.model.decoder.up_blocks[2].upsamplers[0].channels
+            // self.shard_config.tensor_parallel_size,
+        }
+
+        return {
+            EncoderCausal3D: ModulePolicyDescription(
+                sub_module_replacement=encoder_subs,
+                attribute_replacement=encoder_attrs,
+                method_replacement={"prepare_attention_mask": _mask_method()},
+            ),
+            DecoderCausal3D: ModulePolicyDescription(
+                sub_module_replacement=decoder_subs,
+                attribute_replacement=decoder_attrs,
+                method_replacement={"prepare_attention_mask": _mask_method()},
+            ),
+        }
+
+    def postprocess(self):
+        """Return the model unchanged."""
+        return self.model
diff --git a/opensora/models/hunyuan_vae/unet_causal_3d_blocks.py b/opensora/models/hunyuan_vae/unet_causal_3d_blocks.py
new file mode 100644
index 0000000..5781d18
--- /dev/null
+++ b/opensora/models/hunyuan_vae/unet_causal_3d_blocks.py
@@ -0,0 +1,476 @@
+# Modified from diffusers==0.29.2 and HunyuanVideo
+# 
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copyright 2024 HunyuanVideo
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from diffusers.models.activations import get_activation
+from diffusers.models.attention_processor import Attention
+from diffusers.utils import logging
+from einops import rearrange
+from torch import nn
+
+from opensora.acceleration.checkpoint import auto_grad_checkpoint
+from opensora.models.vae.utils import ChannelChunkConv3d, get_conv3d_n_chunks
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+INTERPOLATE_NUMEL_LIMIT = 2**31 - 1
+
+
+def chunk_nearest_interpolate(
+    x: torch.Tensor,
+    scale_factor,
+):
+    """Nearest-neighbour upsample `x` piecewise along dim 1 and concatenate the pieces back on dim 1."""
+    # Budget handed to get_conv3d_n_chunks; presumably bounds the per-piece element count -- confirm in vae.utils.
+    numel_budget = INTERPOLATE_NUMEL_LIMIT // np.prod(scale_factor)
+    pieces = x.chunk(get_conv3d_n_chunks(x.numel(), x.size(1), numel_budget), dim=1)
+    return torch.cat([F.interpolate(piece, scale_factor=scale_factor, mode="nearest") for piece in pieces], dim=1)
+
+
+def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: Optional[int] = None):
+    """Return an additive mask (0 = visible, -inf = hidden): a token sees its own and all earlier frames."""
+    seq_len = n_frame * n_hw
+    frame_idx = torch.arange(n_frame, device=device).repeat_interleave(n_hw)  # frame id of every token
+    visible = frame_idx.unsqueeze(0) <= frame_idx.unsqueeze(1)  # [i, j] is True when frame(j) <= frame(i)
+    mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device).masked_fill(visible, 0)
+    if batch_size is not None:
+        mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
+    return mask
+
+
+class CausalConv3d(nn.Module):
+    """
+    Implements a causal 3D convolution layer where each position only depends on previous timesteps and current spatial locations.
+    This maintains temporal causality in video generation tasks.
+
+    `kernel_size` may be an int or a per-axis `(T, H, W)` tuple; extra keyword arguments go to the inner convolution.
+    The time axis is padded with `kernel - 1` frames in front and none behind; H and W get `kernel // 2` per side.
+    """
+
+    def __init__(
+        self,
+        chan_in,
+        chan_out,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        dilation: Union[int, Tuple[int, int, int]] = 1,
+        pad_mode="replicate",
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.pad_mode = pad_mode
+        # Normalise to per-axis sizes: the signature allows a tuple, but `tuple // 2` would raise a TypeError.
+        k_t, k_h, k_w = (kernel_size,) * 3 if isinstance(kernel_size, int) else kernel_size
+        # F.pad lists the last dimension first: (W_left, W_right, H_top, H_bottom, T_front, T_back).
+        # For an int kernel this is exactly the tuple the original code built.
+        self.time_causal_padding = (k_w // 2, k_w // 2, k_h // 2, k_h // 2, k_t - 1, 0)
+
+        self.conv = ChannelChunkConv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
+
+    def forward(self, x):
+        """Pad `x` with `self.time_causal_padding` using `self.pad_mode`, then apply the convolution."""
+        x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
+        return self.conv(x)
+
+class UpsampleCausal3D(nn.Module):
+    """
+    Nearest-neighbour 3D upsampling followed by a `CausalConv3d` (the convolution is always applied).
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        kernel_size: int = 3,
+        bias=True,
+        upsample_factor=(2, 2, 2),
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.upsample_factor = upsample_factor
+        self.conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, bias=bias)
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        assert input_tensor.shape[1] == self.channels
+
+        # Interpolation: the first frame is upsampled spatially only, every later frame with
+        # the full `upsample_factor`; the pieces are then concatenated along dim 2 and
+        # passed through the causal convolution.
+        hidden_states = input_tensor
+        # NOTE: upstream cast bfloat16 inputs to float32 before interpolating because its
+        # 'upsample_nearest2d_out_frame' op did not support bfloat16. That round-trip is
+        # disabled in this port, so interpolation runs in the input dtype.
+        # NOTE(review): confirm every targeted backend supports bfloat16 nearest interpolation.
+
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+
+        # interpolate H & W only for the first frame; interpolate T & H & W for the rest
+        T = hidden_states.size(2)
+        first_h, other_h = hidden_states.split((1, T - 1), dim=2)
+        # remaining frames: upsample with the full (T, H, W) factor; skipped when there are none
+        if T > 1:
+            other_h = chunk_nearest_interpolate(other_h, scale_factor=self.upsample_factor)
+        # first frame: drop the time axis, upsample H & W only, then restore the time axis
+        first_h = first_h.squeeze(2)
+        first_h = chunk_nearest_interpolate(first_h, scale_factor=self.upsample_factor[1:])
+        first_h = first_h.unsqueeze(2)
+        # concat together
+        if T > 1:
+            hidden_states = torch.cat((first_h, other_h), dim=2)
+        else:
+            hidden_states = first_h
+
+        # The matching cast back to bfloat16 is disabled together with the float32
+        # up-cast noted above, so no dtype restoration happens here.
+        # The causal convolution below runs on the interpolated tensor.
+
+        hidden_states = self.conv(hidden_states)
+
+        return hidden_states
+
+class DownsampleCausal3D(nn.Module):
+    """
+    Downsamples a 3D feature map with a single strided `CausalConv3d`; the channel count is unchanged.
+    """
+
+    def __init__(
+        self,
+        channels: int,
+        kernel_size=3,
+        bias=True,
+        stride=2,
+    ):
+        super().__init__()
+        self.channels = self.out_channels = channels
+        # The strided causal convolution is the only operation of this layer.
+        self.conv = CausalConv3d(channels, channels, kernel_size=kernel_size, stride=stride, bias=bias)
+
+    def forward(self, input_tensor: torch.FloatTensor) -> torch.FloatTensor:
+        """Apply the strided convolution; dim 1 of `input_tensor` must equal `self.channels`."""
+        assert input_tensor.shape[1] == self.channels
+        # Hand the convolution result straight back to the caller.
+        return self.conv(input_tensor)
+
+
+class ResnetBlockCausal3D(nn.Module):
+    r"""
+    A pre-norm residual block: two (GroupNorm -> activation -> CausalConv3d) stages plus a skip connection.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        dropout: float = 0.0,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        pre_norm: bool = True,
+        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        conv_shortcut_bias: bool = True,
+        conv_3d_out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        # `pre_norm` is accepted but ignored: forward() always normalises first, so the flag is pinned to True.
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1)
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_3d_out_channels = conv_3d_out_channels or out_channels
+        self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels, kernel_size=3, stride=1)
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        self.upsample = self.downsample = None  # never set to anything else in this class
+
+        # Unless forced by the caller, a shortcut conv is used only when the skip path must change channel count.
+        self.use_in_shortcut = self.in_channels != conv_3d_out_channels if use_in_shortcut is None else use_in_shortcut
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = CausalConv3d(
+                in_channels,
+                conv_3d_out_channels,
+                kernel_size=1,
+                stride=1,
+                bias=conv_shortcut_bias,
+            )
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        hidden_states = input_tensor
+        # norm -> activation -> conv, twice; dropout sits just before the second conv
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+
+        return output_tensor
+
+
+class UNetMidBlockCausal3D(nn.Module):
+    """
+    A 3D UNet mid-block: one resnet, then `num_layers` x (optional attention over all (f, h, w) tokens + resnet).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
+        resnet_pre_norm: bool = True,
+        add_attention: bool = True,
+        attention_head_dim: int = 1,
+        output_scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = resnet_groups
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlockCausal3D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        if attention_head_dim is None:
+            logger.warning(  # `Logger.warn` is a deprecated alias that Python 3.13 removed
+                f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
+            )
+            attention_head_dim = in_channels
+
+        for _ in range(num_layers):
+            if self.add_attention:
+                attentions.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        rescale_output_factor=output_scale_factor,
+                        eps=resnet_eps,
+                        norm_num_groups=attn_groups,
+                        spatial_norm_dim=None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+            else:
+                attentions.append(None)  # placeholder keeps attentions aligned with resnets[1:]
+
+            resnets.append(
+                ResnetBlockCausal3D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states: torch.FloatTensor, attention_mask: Optional[torch.Tensor]) -> torch.FloatTensor:
+        hidden_states = self.resnets[0](hidden_states)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                B, C, T, H, W = hidden_states.shape
+                hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")  # flatten to a token sequence
+                hidden_states = attn(hidden_states, attention_mask=attention_mask)
+                hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states
+
+
+class DownEncoderBlockCausal3D(nn.Module):
+    """
+    Encoder stage: `num_layers` `ResnetBlockCausal3D` blocks, optionally followed by one
+    `DownsampleCausal3D` layer with stride `downsample_stride`.
+
+    Every sub-module is invoked through `auto_grad_checkpoint`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
+        downsample_stride: int = 2,
+    ):
+        super().__init__()
+        # Only the first resnet maps in_channels -> out_channels; later ones keep out_channels.
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=in_channels if layer_idx == 0 else out_channels,
+                    out_channels=out_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+                for layer_idx in range(num_layers)
+            ]
+        )
+
+        if add_downsample:
+            downsampler = DownsampleCausal3D(out_channels, stride=downsample_stride)
+            self.downsamplers = nn.ModuleList([downsampler])
+        else:
+            # `None` (not an empty list) tells forward() to skip downsampling.
+            self.downsamplers = None
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Run the resnets, then the downsampler if there is one, each via `auto_grad_checkpoint`."""
+        stages = list(self.resnets)
+        if self.downsamplers is not None:
+            stages.extend(self.downsamplers)
+        for stage in stages:
+            hidden_states = auto_grad_checkpoint(stage, hidden_states)
+
+        return hidden_states
+
+
+class UpDecoderBlockCausal3D(nn.Module):
+    """
+    Decoder stage: `num_layers` `ResnetBlockCausal3D` blocks, optionally followed by one
+    `UpsampleCausal3D` layer using `upsample_scale_factor`.
+
+    Every sub-module is invoked through `auto_grad_checkpoint`; `resolution_idx` is only stored.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resolution_idx: Optional[int] = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
+        upsample_scale_factor=(2, 2, 2),
+    ):
+        super().__init__()
+        # Only the first resnet maps in_channels -> out_channels; later ones keep out_channels.
+        self.resnets = nn.ModuleList(
+            [
+                ResnetBlockCausal3D(
+                    in_channels=in_channels if layer_idx == 0 else out_channels,
+                    out_channels=out_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+                for layer_idx in range(num_layers)
+            ]
+        )
+
+        if add_upsample:
+            upsampler = UpsampleCausal3D(
+                out_channels,
+                out_channels=out_channels,
+                upsample_factor=upsample_scale_factor,
+            )
+            self.upsamplers = nn.ModuleList([upsampler])
+        else:
+            self.upsamplers = None
+
+        self.resolution_idx = resolution_idx
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Run the resnets, then the upsampler if there is one, each via `auto_grad_checkpoint`."""
+        stages = list(self.resnets)
+        if self.upsamplers is not None:
+            stages.extend(self.upsamplers)
+        for stage in stages:
+            hidden_states = auto_grad_checkpoint(stage, hidden_states)
+        return hidden_states
diff --git a/opensora/models/hunyuan_vae/vae.py b/opensora/models/hunyuan_vae/vae.py
new file mode 100644
index 0000000..1bef8c7
--- /dev/null
+++ b/opensora/models/hunyuan_vae/vae.py
@@ -0,0 +1,340 @@
+# Modified from HunyuanVideo
+# 
+# Copyright 2024 HunyuanVideo
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import randn_tensor
+
+from opensora.acceleration.checkpoint import auto_grad_checkpoint, checkpoint
+from opensora.models.hunyuan_vae.unet_causal_3d_blocks import (
+    CausalConv3d,
+    DownEncoderBlockCausal3D,
+    UNetMidBlockCausal3D,
+    UpDecoderBlockCausal3D,
+    prepare_causal_attention_mask,
+)
+
+
+@dataclass
+class DecoderOutput(BaseOutput):
+    r"""
+    Output of decoding method.
+
+    Args:
+        sample (`torch.FloatTensor`):
+            The decoded output sample. NOTE(review): upstream documented a 4-D shape here; `DecoderCausal3D` in this file works on 5-D tensors -- confirm the shape at call sites.
+    """
+
+    sample: torch.FloatTensor
+
+
+class EncoderCausal3D(nn.Module):
+    r"""
+    The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        double_z: bool = True,
+        mid_block_add_attention=True,
+        time_compression_ratio: int = 4,
+        spatial_compression_ratio: int = 8,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
+        self.mid_block = None
+        self.down_blocks = nn.ModuleList([])
+
+        # down: one DownEncoderBlockCausal3D per entry of `block_out_channels`
+        output_channel = block_out_channels[0]
+        for i, _ in enumerate(block_out_channels):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))  # loop-invariant
+            num_time_downsample_layers = int(np.log2(time_compression_ratio))  # loop-invariant
+
+            if time_compression_ratio == 4:
+                add_spatial_downsample = bool(i < num_spatial_downsample_layers)  # the first blocks halve H and W
+                add_time_downsample = bool(  # time is halved in the later blocks, never in the final one
+                    i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block
+                )
+            elif time_compression_ratio == 8:
+                add_spatial_downsample = bool(i < num_spatial_downsample_layers)
+                add_time_downsample = bool(i < num_spatial_downsample_layers)  # same blocks as the spatial downsampling
+            else:
+                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
+
+            downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
+            downsample_stride_T = (2,) if add_time_downsample else (1,)
+            downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)  # (T, H, W) order
+            down_block = DownEncoderBlockCausal3D(
+                num_layers=self.layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                dropout=dropout,
+                add_downsample=bool(add_spatial_downsample or add_time_downsample),
+                downsample_stride=downsample_stride,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+            )
+
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlockCausal3D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            attention_head_dim=block_out_channels[-1],  # equals in_channels, so the mid-block attention gets one head
+            resnet_groups=norm_num_groups,
+            add_attention=mid_block_add_attention,
+        )
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()
+
+        # doubled output channels are presumably mean + log-variance for the latent distribution -- confirm at caller
+        conv_out_channels = 2 * out_channels if double_z else out_channels
+        self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
+
+    def prepare_attention_mask(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        B, C, T, H, W = hidden_states.shape  # one attention token per (frame, pixel) position
+        attention_mask = prepare_causal_attention_mask(
+            T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B
+        )
+        return attention_mask
+
+    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `EncoderCausal3D` class."""
+        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
+
+        sample = self.conv_in(sample)
+
+        # down
+        for down_block in self.down_blocks:
+            sample = down_block(sample)
+
+        # middle: the causal mask is built only when the mid block actually contains attention
+        if self.mid_block.add_attention:
+            attention_mask = self.prepare_attention_mask(sample)
+        else:
+            attention_mask = None
+        sample = auto_grad_checkpoint(self.mid_block, sample, attention_mask)
+
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class DecoderCausal3D(nn.Module):
+    r"""
+    The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        mid_block_add_attention=True,
+        time_compression_ratio: int = 4,
+        spatial_compression_ratio: int = 8,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.layers_per_block = layers_per_block
+
+        self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        # mid
+        self.mid_block = UNetMidBlockCausal3D(
+            in_channels=block_out_channels[-1],
+            resnet_eps=1e-6,
+            resnet_act_fn=act_fn,
+            output_scale_factor=1,
+            attention_head_dim=block_out_channels[-1],  # equals in_channels, so the mid-block attention gets one head
+            resnet_groups=norm_num_groups,
+            add_attention=mid_block_add_attention,
+        )
+
+        # up: one UpDecoderBlockCausal3D per entry of `block_out_channels`, widest channels first
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, _ in enumerate(block_out_channels):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))  # loop-invariant
+            num_time_upsample_layers = int(np.log2(time_compression_ratio))  # loop-invariant
+
+            if time_compression_ratio == 4:
+                add_spatial_upsample = bool(i < num_spatial_upsample_layers)  # the first blocks double H and W
+                add_time_upsample = bool(  # time is doubled in the later blocks, never in the final one
+                    i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block
+                )
+            elif time_compression_ratio == 8:
+                add_spatial_upsample = bool(i < num_spatial_upsample_layers)
+                add_time_upsample = bool(i < num_spatial_upsample_layers)  # same blocks as the spatial upsampling
+            else:
+                raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
+
+            upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
+            upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
+            upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)  # (T, H, W) order
+            up_block = UpDecoderBlockCausal3D(
+                num_layers=self.layers_per_block + 1,  # one resnet more per block than `layers_per_block`
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+                resolution_idx=None,
+                dropout=dropout,
+                add_upsample=bool(add_spatial_upsample or add_time_upsample),
+                upsample_scale_factor=upsample_scale_factor,
+                resnet_eps=1e-6,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+            )
+
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel  # redundant: reassigned at the top of the next iteration
+
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
+
+    def post_process(self, sample: torch.Tensor) -> torch.Tensor:
+        sample = self.conv_norm_out(sample)  # norm + activation only; conv_out is applied by forward()
+        sample = self.conv_act(sample)
+        return sample
+
+    def prepare_attention_mask(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        B, C, T, H, W = hidden_states.shape  # one attention token per (frame, pixel) position
+        attention_mask = prepare_causal_attention_mask(
+            T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B
+        )
+        return attention_mask
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        r"""The forward method of the `DecoderCausal3D` class."""
+        assert len(sample.shape) == 5, "The input tensor should have 5 dimensions."
+
+        sample = self.conv_in(sample)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype  # dtype of the first up-block parameter
+
+        # middle: the causal mask is built only when the mid block actually contains attention
+        if self.mid_block.add_attention:
+            attention_mask = self.prepare_attention_mask(sample)
+        else:
+            attention_mask = None
+
+        sample = auto_grad_checkpoint(self.mid_block, sample, attention_mask)
+        sample = sample.to(upscale_dtype)
+
+        # up
+        for up_block in self.up_blocks:
+            sample = up_block(sample)
+
+        # post-process; `grad_checkpointing` is not set in __init__, so it is presumably set externally -- confirm
+        if getattr(self, "grad_checkpointing", False):
+            sample = checkpoint(self.post_process, sample, use_reentrant=True)
+        else:
+            sample = self.post_process(sample)
+
+        sample = self.conv_out(sample)
+
+        return sample
+
+
+class DiagonalGaussianDistribution:
+    """Diagonal Gaussian parameterised by a tensor that stacks mean and log-variance along one axis."""
+
+    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
+        if parameters.ndim == 3:
+            dim = 2  # (B, L, C)
+        elif parameters.ndim in (4, 5):
+            dim = 1  # (B, C, T, H ,W) / (B, C, H, W)
+        else:
+            raise NotImplementedError(f"Unsupported parameters.ndim: {parameters.ndim}")
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            # Zero spread, so sample() returns the mean; `mean` is a chunk of `parameters` (same device/dtype).
+            self.var = self.std = torch.zeros_like(self.mean)
+
+    def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
+        """Draw `mean + std * noise`, with the noise on the same device and dtype as the parameters."""
+        noise = randn_tensor(
+            self.mean.shape,
+            generator=generator,
+            device=self.parameters.device,
+            dtype=self.parameters.dtype,
+        )
+        return self.mean + self.std * noise
+
+    def kl(self, other: Optional["DiagonalGaussianDistribution"] = None) -> torch.Tensor:
+        """KL divergence to `other` (to a standard normal when `other` is None), summed over all non-batch dims."""
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        reduce_dim = list(range(1, self.mean.ndim))
+        if other is None:
+            return 0.5 * torch.sum(
+                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                dim=reduce_dim,
+            )
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var
+            - 1.0
+            - self.logvar
+            + other.logvar,
+            dim=reduce_dim,
+        )
+
+    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = (1, 2, 3)) -> torch.Tensor:
+        """Negative log-likelihood of `sample`, summed over `dims` (immutable tuple default, not a shared list)."""
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+
+    def mode(self) -> torch.Tensor:
+        return self.mean
diff --git a/opensora/models/mmdit/__init__.py b/opensora/models/mmdit/__init__.py
new file mode 100644
index 0000000..c505ed9
--- /dev/null
+++ b/opensora/models/mmdit/__init__.py
@@ -0,0 +1 @@
+from .model import Flux
diff --git a/opensora/models/mmdit/distributed.py b/opensora/models/mmdit/distributed.py
new file mode 100644
index 0000000..9f5288a
--- /dev/null
+++ b/opensora/models/mmdit/distributed.py
@@ -0,0 +1,883 @@
+from functools import partial
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from colossalai.shardformer.layer import (FusedLinear1D_Col, FusedLinear1D_Row,
+                                          Linear1D_Col, Linear1D_Row)
+from colossalai.shardformer.layer._operation import all_to_all_comm
+from colossalai.shardformer.layer.attn import RingComm, _rescale_out_lse
+from colossalai.shardformer.layer.utils import is_share_sp_tp
+from colossalai.shardformer.policies.base_policy import (
+    ModulePolicyDescription, Policy, SubModuleReplacementDescription)
+from colossalai.shardformer.shard import ShardConfig
+from einops import rearrange
+from flash_attn.flash_attn_interface import (_flash_attn_backward,
+                                             _flash_attn_forward)
+from liger_kernel.ops.rope import LigerRopeFunction
+
+# Optional backend: the ``flash_attn_interface`` module provides the kernels that
+# are aliased with a ``_v3`` suffix below. Any failure while importing it falls
+# back to the ``flash_attn`` kernels imported above.
+try:
+    from flash_attn_interface import \
+        _flash_attn_backward as _flash_attn_backward_v3
+    from flash_attn_interface import \
+        _flash_attn_forward as _flash_attn_forward_v3
+
+    SUPPORT_FA3 = True
+except Exception:
+    # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt and
+    # SystemExit still propagate during import.
+    SUPPORT_FA3 = False
+
+from torch import Tensor
+
+from opensora.acceleration.checkpoint import auto_grad_checkpoint
+
+from .layers import DoubleStreamBlock, SingleStreamBlock
+from .math import apply_rope, attention
+from .model import MMDiTModel
+
+
+class _SplitForwardGatherBackwardVarLen(torch.autograd.Function):
+    """
+    Split the input into variable-length chunks and keep only the chunk that
+    belongs to the calling rank; the backward pass all-gathers the gradients.
+
+    Args:
+        input_ (`torch.Tensor`): input matrix.
+        dim (int): the dimension to perform split and gather
+        process_group (`torch.distributed.ProcessGroup`): the process group used for collective communication
+        splits (List[int]): chunk length along `dim` for every rank of the group
+
+    """
+
+    @staticmethod
+    def forward(ctx, input_, dim, process_group, splits: List[int]):
+        my_rank = dist.get_rank(process_group)
+        # Remember everything backward needs to rebuild the full gradient.
+        ctx.dim = dim
+        ctx.splits = splits
+        ctx.process_group = process_group
+        # Gradients are scaled by this rank's share of the full length.
+        ctx.grad_scale = splits[my_rank] / sum(splits)
+        local_chunk = torch.split(input_, splits, dim=dim)[my_rank]
+        return local_chunk.clone()
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        scaled_grad = (grad_output * ctx.grad_scale).contiguous()
+        group_size = dist.get_world_size(ctx.process_group)
+        # One receive buffer per rank, sized by that rank's split length.
+        recv_buffers = []
+        for peer in range(group_size):
+            peer_shape = list(scaled_grad.shape)
+            peer_shape[ctx.dim] = ctx.splits[peer]
+            recv_buffers.append(torch.empty(peer_shape, dtype=scaled_grad.dtype, device=scaled_grad.device))
+        dist.all_gather(recv_buffers, scaled_grad, group=ctx.process_group)
+        full_grad = torch.cat(recv_buffers, dim=ctx.dim)
+        # Only ``input_`` receives a gradient.
+        return full_grad, None, None, None
+
+
+def split_forward_gather_backward_var_len(input_, dim, process_group, splits: List[int]):
+    """Apply ``_SplitForwardGatherBackwardVarLen`` to ``input_``.
+
+    Args:
+        input_: Tensor to split along ``dim``.
+        dim: Dimension that is split in forward and gathered in backward.
+        process_group: Process group used for the collective communication.
+        splits: Chunk length along ``dim`` for every rank of the group.
+    """
+    autograd_fn = _SplitForwardGatherBackwardVarLen
+    return autograd_fn.apply(input_, dim, process_group, splits)
+
+
+class _GatherForwardSplitBackwardVarLen(torch.autograd.Function):
+    """
+    All-gather variable-length chunks from every rank and concatenate them; the
+    backward pass keeps only the gradient slice that belongs to the calling rank.
+
+    Args:
+        input_ (`torch.Tensor`): this rank's chunk.
+        dim (int): the dimension to perform gather and split
+        process_group (`torch.distributed.ProcessGroup`): the process group used for collective communication
+        splits (List[int]): chunk length along `dim` for every rank of the group
+
+    """
+
+    @staticmethod
+    def forward(ctx, input_, dim, process_group, splits: List[int]):
+        local_chunk = input_.contiguous()
+        my_rank = dist.get_rank(process_group)
+
+        # State needed to slice the gradient again in backward.
+        ctx.process_group = process_group
+        ctx.dim = dim
+        ctx.splits = splits
+        # Inverse of the scale used by the split-forward counterpart.
+        ctx.grad_scale = sum(splits) / splits[my_rank]
+
+        group_size = dist.get_world_size(ctx.process_group)
+        # One receive buffer per rank, sized by that rank's split length.
+        recv_buffers = []
+        for peer in range(group_size):
+            peer_shape = list(local_chunk.shape)
+            peer_shape[dim] = splits[peer]
+            recv_buffers.append(torch.empty(peer_shape, dtype=local_chunk.dtype, device=local_chunk.device))
+        dist.all_gather(recv_buffers, local_chunk, group=ctx.process_group)
+        return torch.cat(recv_buffers, dim=dim)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        my_rank = dist.get_rank(ctx.process_group)
+        scaled_grad = grad_output * ctx.grad_scale
+        local_grad = torch.split(scaled_grad, ctx.splits, dim=ctx.dim)[my_rank]
+        # Only ``input_`` receives a gradient.
+        return local_grad.clone(), None, None, None
+
+
+def gather_forward_split_backward_var_len(input_, dim, process_group, splits: List[int]):
+    """Apply ``_GatherForwardSplitBackwardVarLen`` to ``input_``.
+
+    Args:
+        input_: This rank's chunk, gathered along ``dim``.
+        dim: Dimension that is gathered in forward and split in backward.
+        process_group: Process group used for the collective communication.
+        splits: Chunk length along ``dim`` for every rank of the group.
+    """
+    autograd_fn = _GatherForwardSplitBackwardVarLen
+    return autograd_fn.apply(input_, dim, process_group, splits)
+
+
+def _fa_forward(
+    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, dropout_p: float = 0.0, softmax_scale: Optional[float] = None
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Run one non-causal flash-attention forward pass.
+
+    Dispatches to ``_flash_attn_forward_v3`` when ``SUPPORT_FA3`` is set and to
+    ``_flash_attn_forward`` otherwise.
+
+    Args:
+        q: Query tensor.
+        k: Key tensor.
+        v: Value tensor.
+        dropout_p: Dropout probability; only forwarded on the non-FA3 path.
+        softmax_scale: Softmax scaling factor passed to the kernel.
+
+    Returns:
+        ``(out, softmax_lse, rng_state)``; ``rng_state`` is ``None`` on the FA3 path.
+    """
+    if SUPPORT_FA3:
+        # The positional ``None`` arguments are grouped as named in the inline
+        # comments. NOTE(review): this layout is tied to the installed
+        # ``flash_attn_interface`` version - confirm when upgrading.
+        out, softmax_lse, *_ = _flash_attn_forward_v3(
+            q,
+            k,
+            v,
+            None,
+            None,
+            None,
+            None,  # k_new, q_new, qv, out
+            None,
+            None,
+            None,  # cu_seqlens_q, cu_seqlens_k, cu_seqlens_k_new
+            None,
+            None,
+            None,
+            None,  # seqused_q, seqused_k, max_seqlen_q, max_seqlen_k
+            None,
+            None,
+            None,  # page_table, kv_batch_idx, leftpad_k
+            None,
+            None,  # rotary_cos/sin
+            None,
+            None,
+            None,  # q_descale, k_descale, v_descale
+            softmax_scale,
+            False,  # causal
+            (-1, -1),
+        )
+        # ``dropout_p`` is not passed on this path, so there is no RNG state to return.
+        rng_state = None
+    else:
+        out, softmax_lse, _, rng_state = _flash_attn_forward(
+            q,
+            k,
+            v,
+            dropout_p,
+            softmax_scale,
+            causal=False,
+            window_size_left=-1,
+            window_size_right=-1,
+            softcap=0.0,
+            alibi_slopes=None,
+            return_softmax=False,
+        )
+    return out, softmax_lse, rng_state
+
+
+def _fa_backward(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    out: torch.Tensor,
+    softmax_lse: torch.Tensor,
+    dq: torch.Tensor,
+    dk: torch.Tensor,
+    dv: torch.Tensor,
+    rng_state: torch.Tensor,
+    dropout_p: float = 0.0,
+    softmax_scale: Optional[float] = None,
+    deterministic: bool = False,
+) -> None:
+    """Run one non-causal flash-attention backward pass.
+
+    Dispatches to ``_flash_attn_backward_v3`` when ``SUPPORT_FA3`` is set and to
+    ``_flash_attn_backward`` otherwise. Nothing is returned; ``dq``, ``dk`` and
+    ``dv`` are handed to the kernel, which presumably fills them in place
+    (callers read them after this call) - TODO confirm against flash-attn.
+
+    Args:
+        dout: Gradient of the attention output.
+        q: Query tensor used in forward.
+        k: Key tensor used in forward.
+        v: Value tensor used in forward.
+        out: Attention output from forward.
+        softmax_lse: Log-sum-exp from forward.
+        dq: Buffer passed to the kernel for the query gradient.
+        dk: Buffer passed to the kernel for the key gradient.
+        dv: Buffer passed to the kernel for the value gradient.
+        rng_state: RNG state from forward; only forwarded on the non-FA3 path.
+        dropout_p: Dropout probability; only forwarded on the non-FA3 path.
+        softmax_scale: Softmax scaling factor passed to the kernel.
+        deterministic: Forwarded to the kernel's ``deterministic`` argument.
+    """
+    if SUPPORT_FA3:
+        # NOTE(review): the six positional ``None`` values are tied to the
+        # installed ``flash_attn_interface`` signature - confirm when upgrading.
+        _flash_attn_backward_v3(
+            dout,
+            q,
+            k,
+            v,
+            out,
+            softmax_lse,
+            None, None, None, None, None, None,
+            dq,
+            dk,
+            dv,
+            softmax_scale,
+            False,  # causal
+            (-1, -1),
+            deterministic=deterministic,
+        )
+    else:
+        _flash_attn_backward(
+            dout,
+            q,
+            k,
+            v,
+            out,
+            softmax_lse,
+            dq,
+            dk,
+            dv,
+            dropout_p=dropout_p,
+            softmax_scale=softmax_scale,
+            causal=False,
+            window_size_left=-1,
+            window_size_right=-1,
+            softcap=0.0,
+            alibi_slopes=None,
+            deterministic=deterministic,
+            rng_state=rng_state,
+        )
+
+
+class RingAttention(torch.autograd.Function):
+    """Non-causal ring attention over a sequence-parallel process group.
+
+    Every rank keeps its own query block and passes key/value blocks around the
+    ring with ``RingComm``; partial outputs are merged with ``_rescale_out_lse``.
+    Use :meth:`attention` as the entry point: it creates the shared CUDA event
+    and stream that :meth:`forward` relies on.
+    """
+
+    # Event recorded after each attention kernel is launched; created in ``attention``.
+    ATTN_DONE: torch.cuda.Event = None
+    # Secondary CUDA stream alternated with the current stream; created in ``attention``.
+    SP_STREAM: torch.cuda.Stream = None
+
+    @staticmethod
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        sp_group: dist.ProcessGroup,
+        sp_stream: torch.cuda.Stream,
+        dropout_p: float = 0.0,
+        softmax_scale: Optional[float] = None,
+        deterministic: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Ring attention forward
+
+        Args:
+            ctx: autograd context used to stash state for backward
+            q (torch.Tensor): shape [B, S, N, D]
+            k (torch.Tensor): shape [B, S, N, D]
+            v (torch.Tensor): shape [B, S, N, D]
+            sp_group (dist.ProcessGroup): sequence parallel group
+            sp_stream (torch.cuda.Stream): sequence parallel stream
+            dropout_p (float, optional): dropout prob. Defaults to 0.0.
+            softmax_scale (Optional[float], optional): softmax scale. Defaults to None,
+                in which case ``q.shape[-1] ** -0.5`` is used.
+            deterministic (Optional[bool], optional): backward deterministic mode. Defaults to False.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: output and log sum exp. Output's shape should be [B, S, N, D]. LSE's shape should be [B, N, S].
+        """
+        if softmax_scale is None:
+            softmax_scale = q.shape[-1] ** (-0.5)
+        sp_size = dist.get_world_size(sp_group)
+        kv_comms: List[RingComm] = [RingComm(sp_group) for _ in range(2)]
+
+        # [B, S, N, D]
+        q, k, v = [x.contiguous() for x in [q, k, v]]
+        # Pre-allocate double buffer for overlapping and receiving next step's inputs
+        kv_buffers = [torch.stack((k, v))]  # (2, B, S, N, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+        # outputs
+        out = None
+        block_out = [None, None]
+        softmax_lse = [None, None]
+        block_softmax_lse = [None, None]  # log sum exp, the denominator of softmax in attention
+        rng_states = [None for _ in range(sp_size)]
+        sp_streams = [torch.cuda.current_stream(), sp_stream]
+
+        def _kv_comm(i):
+            # Avoid overwriting attn input when it shares mem with buffer
+            if not RingAttention.ATTN_DONE.query():
+                kv_buffers[(i + 1) % 2] = torch.empty_like(kv_buffers[i % 2])
+            if i < sp_size - 1:
+                kv_comms[i % 2].send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+        for i in range(sp_size):
+            # Steps alternate between the current stream and ``sp_stream``.
+            with torch.cuda.stream(sp_streams[i % 2]):
+                # Wait for current kv from prev rank
+                # NOTE: waiting outside the current stream will NOT correctly synchronize.
+                if i == 0:
+                    _kv_comm(i)
+                else:
+                    kv_comms[(i + 1) % 2].wait()
+                kv_block = kv_buffers[i % 2]
+                q_block = q
+                block_out[i % 2], block_softmax_lse[i % 2], rng_states[i] = _fa_forward(
+                    q_block, kv_block[0], kv_block[1], dropout_p, softmax_scale
+                )
+                RingAttention.ATTN_DONE.record()
+                # Pipeline the next KV comm with output correction instead of the next flash attn
+                # to minimize idle time when comm takes longer than attn.
+                _kv_comm(i + 1)
+                block_softmax_lse[i % 2] = (
+                    block_softmax_lse[i % 2].transpose(1, 2).unsqueeze(-1).contiguous().float()
+                )  # [B, N, S] -> [B, S, N, 1]
+                assert block_out[i % 2].shape[:-1] == block_softmax_lse[i % 2].shape[:-1]
+                # Output and log sum exp correction. Ideally overlap this with the next flash attn kernel.
+                # In reality this always finishes before next flash attn; no need for extra sync.
+                if i == 0:
+                    out = block_out[0]
+                    softmax_lse = block_softmax_lse[0]
+                else:
+                    out, softmax_lse = _rescale_out_lse(out, block_out[i % 2], softmax_lse, block_softmax_lse[i % 2])
+        # Make the current stream wait for work queued on ``sp_stream``.
+        torch.cuda.current_stream().wait_stream(sp_stream)
+        out = out.to(q.dtype)
+        softmax_lse = softmax_lse.squeeze(-1).transpose(1, 2).contiguous()
+
+        ctx.dropout_p = dropout_p
+        ctx.softmax_scale = softmax_scale
+        ctx.deterministic = deterministic
+        ctx.sp_group = sp_group
+        ctx.save_for_backward(q, k, v, out, softmax_lse, *rng_states)  # lse [B, N, S]
+        return out, softmax_lse
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_softmax_lse):
+        """Ring attention backward.
+
+        Args:
+            ctx: autograd context populated by :meth:`forward`.
+            grad_output: gradient of the attention output.
+            grad_softmax_lse: gradient of the returned LSE; not used.
+
+        Returns:
+            One entry per ``forward`` input: ``(dq, dk, dv)`` followed by
+            ``None`` for the five non-tensor arguments.
+        """
+        # q, k, v, out: [B, S, N, D], softmax_lse: [B, N, S]
+        q, k, v, out, softmax_lse, *rng_states = ctx.saved_tensors
+
+        sp_group = ctx.sp_group
+        sp_size = dist.get_world_size(sp_group)
+        # Separate rings for the KV blocks and for the accumulated dK/dV.
+        kv_comm = RingComm(sp_group)
+        dkv_comm = RingComm(sp_group)
+
+        grad_output = grad_output.contiguous()
+        kv_buffers = [torch.stack((k, v))]  # (2, B, S, N, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+        dq = None
+        dq_block = torch.empty_like(q)
+        dk_block = torch.empty_like(k)
+        dv_block = torch.empty_like(v)
+        # Gradients are accumulated in float32 and cast back at the end.
+        dkv_buffers = [torch.empty_like(kv, dtype=torch.float) for kv in kv_buffers]
+        del k, v
+
+        for i in range(sp_size):
+            if i > 0:
+                kv_comm.wait()
+            if i < sp_size - 1:
+                kv_comm.send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+            k_block, v_block = kv_buffers[i % 2]
+            _fa_backward(
+                grad_output,
+                q,
+                k_block,
+                v_block,
+                out,
+                softmax_lse,
+                dq_block,
+                dk_block,
+                dv_block,
+                rng_states[i],
+                dropout_p=ctx.dropout_p,
+                softmax_scale=ctx.softmax_scale,
+                deterministic=ctx.deterministic,
+            )
+
+            if i == 0:
+                dq = dq_block.float()
+                dkv_buffers[i % 2][0] = dk_block.float()
+                dkv_buffers[i % 2][1] = dv_block.float()
+            else:
+                dq += dq_block
+                dkv_comm.wait()
+                dkv_buffers[i % 2][0] += dk_block
+                dkv_buffers[i % 2][1] += dv_block
+            dkv_comm.send_recv(dkv_buffers[i % 2], dkv_buffers[(i + 1) % 2])
+        dkv_comm.wait()
+        # The last send_recv wrote into the buffer at index ``sp_size % 2``.
+        dkv = dkv_buffers[sp_size % 2]
+
+        dq, dk, dv = [x.to(q.dtype) for x in (dq, *dkv)]
+
+        # forward takes q, k, v, sp_group, sp_stream, dropout_p, softmax_scale,
+        # deterministic: return exactly one gradient slot per input.
+        return dq, dk, dv, None, None, None, None, None
+
+    @staticmethod
+    def attention(
+        q,
+        k,
+        v,
+        sp_group,
+        dropout_p: float = 0.0,
+        softmax_scale: Optional[float] = None,
+        deterministic: bool = False,
+        return_softmax: bool = False,
+    ):
+        """Ring attention
+
+        Args:
+            q (torch.Tensor): shape [B, S, N, D]
+            k (torch.Tensor): shape [B, S, N, D]
+            v (torch.Tensor): shape [B, S, N, D]
+            sp_group (dist.ProcessGroup): sequence parallel group
+            dropout_p (float, optional): dropout prob. Defaults to 0.0.
+            softmax_scale (Optional[float], optional): softmax scale. Defaults to None.
+            deterministic (Optional[bool], optional): backward deterministic mode. Defaults to False.
+            return_softmax (bool, optional): also return the log sum exp. Defaults to False.
+
+        Returns:
+            The output of shape [B, S, N, D]; when ``return_softmax`` is True, a tuple of
+            the output and the log sum exp of shape [B, N, S].
+        """
+        # Lazily create the class-level event and stream shared by all calls.
+        if RingAttention.ATTN_DONE is None:
+            RingAttention.ATTN_DONE = torch.cuda.Event()
+        if RingAttention.SP_STREAM is None:
+            RingAttention.SP_STREAM = torch.cuda.Stream()
+        out, softmax_lse = RingAttention.apply(
+            q, k, v, sp_group, RingAttention.SP_STREAM, dropout_p, softmax_scale, deterministic
+        )
+        if return_softmax:
+            return out, softmax_lse
+        return out
+
+
+def ring_attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, sp_group: dist.ProcessGroup) -> Tensor:
+    """Apply rotary embeddings to ``q``/``k`` and run ring attention.
+
+    Args:
+        q: Query in [B, H, L, D] layout.
+        k: Key in [B, H, L, D] layout.
+        v: Value in [B, H, L, D] layout.
+        pe: Either a tensor handed to ``apply_rope`` or a ``(cos, sin)`` pair
+            handed to ``LigerRopeFunction``.
+        sp_group: Sequence parallel process group.
+
+    Returns:
+        Attention output with the head dimensions merged: [B, L, H * D].
+    """
+    if not isinstance(pe, torch.Tensor):
+        cos, sin = pe
+        q, k = LigerRopeFunction.apply(q, k, cos, sin)
+    else:
+        q, k = apply_rope(q, k, pe)
+    # [B, H, L, D] -> [B, L, H, D]
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    attn_out = RingAttention.attention(q, k, v, sp_group)
+    return rearrange(attn_out, "B L H D -> B L (H D)")
+
+
+class DistributedDoubleStreamBlockProcessor:
+    """Processor for ``DoubleStreamBlock`` that honours the sequence-parallel
+    settings of a ``ShardConfig`` ("all_to_all" or "ring_attn")."""
+
+    def __init__(self, shard_config: ShardConfig) -> None:
+        self.shard_config = shard_config
+
+    @staticmethod
+    def _project_qkv(block: DoubleStreamBlock, stream_attn, hidden: Tensor):
+        """Project ``hidden`` to normalised q/k and v in [B, H, L, D] layout."""
+        fused = stream_attn.fused_qkv
+        if fused:
+            q, k, v = rearrange(
+                stream_attn.qkv(hidden), "B L (K H D) -> K B H L D", K=3, H=block.num_heads, D=block.head_dim
+            )
+        else:
+            q, k, v = (
+                rearrange(proj(hidden), "B L (H D) -> B L H D", H=block.num_heads)
+                for proj in (stream_attn.q_proj, stream_attn.k_proj, stream_attn.v_proj)
+            )
+        q, k = stream_attn.norm(q, k, v)
+        if not fused:
+            # The separate projections are normalised in [B, L, H, D]; move heads forward afterwards.
+            q, k, v = (rearrange(t, "B L H D -> B H L D") for t in (q, k, v))
+        return q, k, v
+
+    def __call__(
+        self, attn: DoubleStreamBlock, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
+    ) -> tuple[Tensor, Tensor]:
+        """Run one double-stream block.
+
+        Args:
+            attn: The block whose sub-modules are applied.
+            img: Image stream hidden states.
+            txt: Text stream hidden states.
+            vec: Conditioning vector fed to both modulation modules.
+            pe: Positional embedding forwarded to the attention function.
+
+        Returns:
+            The updated ``(img, txt)`` pair.
+        """
+        img_mod1, img_mod2 = attn.img_mod(vec)
+        txt_mod1, txt_mod2 = attn.txt_mod(vec)
+
+        # image stream: norm -> modulation -> q/k/v
+        img_modulated = attn.img_norm1(img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+        img_q, img_k, img_v = self._project_qkv(attn, attn.img_attn, img_modulated)
+
+        # text stream: norm -> modulation -> q/k/v
+        txt_modulated = attn.txt_norm1(txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        txt_q, txt_k, txt_v = self._project_qkv(attn, attn.txt_attn, txt_modulated)
+
+        txt_len = txt_q.size(2)
+        # joint attention over the concatenated [txt, img] sequence
+        q = torch.cat((txt_q, img_q), dim=2)
+        k = torch.cat((txt_k, img_k), dim=2)
+        v = torch.cat((txt_v, img_v), dim=2)
+
+        cfg = self.shard_config
+        # ``None`` when sequence parallelism is off, otherwise the configured mode.
+        sp_mode = cfg.sequence_parallelism_mode if cfg.enable_sequence_parallelism else None
+
+        if sp_mode == "all_to_all":
+            assert (
+                attn.num_heads % self.shard_config.sequence_parallel_size == 0
+            ), f"Expected num heads({attn.num_heads}) % sp size({self.shard_config.sequence_parallel_size}) == 0"
+            # TODO: overlap the communication with computation
+            sp_group = cfg.sequence_parallel_process_group
+            q, k, v = (all_to_all_comm(t, sp_group, scatter_dim=1, gather_dim=2) for t in (q, k, v))
+
+        if sp_mode == "ring_attn":
+            attn1 = ring_attention(q, k, v, pe, cfg.sequence_parallel_process_group)
+        else:
+            attn1 = attention(q, k, v, pe=pe)
+
+        if sp_mode == "all_to_all":
+            attn1 = all_to_all_comm(attn1, cfg.sequence_parallel_process_group, scatter_dim=1, gather_dim=2)
+
+        txt_attn = attn1[:, :txt_len]
+        img_attn = attn1[:, txt_len:]
+
+        # image stream: gated attention projection, then gated MLP
+        img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
+        img_mlp_in = (1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift
+        img = img + img_mod2.gate * attn.img_mlp(img_mlp_in)
+
+        # text stream: gated attention projection, then gated MLP
+        txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
+        txt_mlp_in = (1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift
+        txt = txt + txt_mod2.gate * attn.txt_mlp(txt_mlp_in)
+        return img, txt
+
+
+class DistributedSingleStreamBlockProcessor:
+    """Processor for ``SingleStreamBlock`` that honours the sequence-parallel
+    settings of a ``ShardConfig`` ("all_to_all" or "ring_attn")."""
+
+    def __init__(self, shard_config: ShardConfig) -> None:
+        self.shard_config = shard_config
+
+    def __call__(self, attn: SingleStreamBlock, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
+        """Run one single-stream block.
+
+        Args:
+            attn: The block whose sub-modules are applied.
+            x: Hidden states of the joint sequence.
+            vec: Conditioning vector fed to the modulation module.
+            pe: Positional embedding forwarded to the attention function.
+
+        Returns:
+            ``x`` plus the gated block output.
+        """
+        modulation, _ = attn.modulation(vec)
+        x_mod = (1 + modulation.scale) * attn.pre_norm(x) + modulation.shift
+
+        fused = attn.fused_qkv
+        if fused:
+            qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
+            q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
+        else:
+            q = rearrange(attn.q_proj(x_mod), "B L (H D) -> B L H D", H=attn.num_heads)
+            k = rearrange(attn.k_proj(x_mod), "B L (H D) -> B L H D", H=attn.num_heads)
+            # v and the MLP input come out of one shared projection
+            v, mlp = torch.split(attn.v_mlp(x_mod), [attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
+            v = rearrange(v, "B L (H D) -> B L H D", H=attn.num_heads)
+        q, k = attn.norm(q, k, v)
+        if not fused:
+            q, k, v = (rearrange(t, "B L H D -> B H L D") for t in (q, k, v))
+
+        cfg = self.shard_config
+        # ``None`` when sequence parallelism is off, otherwise the configured mode.
+        sp_mode = cfg.sequence_parallelism_mode if cfg.enable_sequence_parallelism else None
+
+        if sp_mode == "all_to_all":
+            assert (
+                attn.num_heads % self.shard_config.sequence_parallel_size == 0
+            ), f"Expected num heads({attn.num_heads}) % sp size({self.shard_config.sequence_parallel_size}) == 0"
+            sp_group = cfg.sequence_parallel_process_group
+            q, k, v = (all_to_all_comm(t, sp_group, scatter_dim=1, gather_dim=2) for t in (q, k, v))
+
+        # attention
+        if sp_mode == "ring_attn":
+            attn_1 = ring_attention(q, k, v, pe, cfg.sequence_parallel_process_group)
+        else:
+            attn_1 = attention(q, k, v, pe=pe)
+
+        if sp_mode == "all_to_all":
+            attn_1 = all_to_all_comm(attn_1, cfg.sequence_parallel_process_group, scatter_dim=1, gather_dim=2)
+
+        # activate the MLP stream, concatenate with attention and run the second linear layer
+        merged = torch.cat((attn_1, attn.mlp_act(mlp)), 2)
+        output = attn.linear2(merged)
+        return x + modulation.gate * output
+
+
+class _TempSwitchCP(torch.autograd.Function):
+    """Identity op that toggles ``enable_sequence_parallelism`` on a shard config.
+
+    The forward pass stores the current flag, overwrites it with ``value`` and
+    returns the input unchanged. The backward pass writes the stored flag back
+    and passes the gradient through.
+    """
+
+    @staticmethod
+    def forward(ctx, input_, shard_config: "ShardConfig", value: bool):
+        ctx.old_value = shard_config.enable_sequence_parallelism
+        ctx.shard_config = shard_config
+        shard_config.enable_sequence_parallelism = value
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Restore the flag silently; no per-step debug output on stdout.
+        ctx.shard_config.enable_sequence_parallelism = ctx.old_value
+        return grad_output, None, None
+
+
+def switch_sequence_parallelism(input_, shard_config: ShardConfig, value: bool):
+    """Apply ``_TempSwitchCP`` to ``input_``.
+
+    Args:
+        input_: Tensor passed through ``_TempSwitchCP``.
+        shard_config: Config whose ``enable_sequence_parallelism`` flag is set.
+        value: New value for the flag.
+    """
+    autograd_fn = _TempSwitchCP
+    return autograd_fn.apply(input_, shard_config, value)
+
+
+def mmdit_model_forward(
+    self: MMDiTModel,
+    img: Tensor,
+    img_ids: Tensor,
+    txt: Tensor,
+    txt_ids: Tensor,
+    timesteps: Tensor,
+    y_vec: Tensor,
+    cond: Tensor = None,
+    guidance: Tensor | None = None,
+    shard_config: ShardConfig = None,
+    stage_index: Optional[List[int]] = None,
+    internal_img: Optional[Tensor] = None,
+    internal_txt: Optional[Tensor] = None,
+    internal_pe: Optional[Tensor] = None,
+    internal_vec: Optional[Tensor] = None,
+    **kwargs,
+):
+    """Forward pass of ``MMDiTModel`` with sequence and pipeline parallelism.
+
+    On the first pipeline stage (or without pipeline parallelism) the block
+    inputs are built with ``self.prepare_block_inputs`` and, when sequence
+    parallelism is enabled, split across the sequence-parallel group. Later
+    stages take their inputs from the ``internal_*`` arguments.
+
+    Args:
+        img, img_ids, txt, txt_ids, timesteps, y_vec, cond, guidance: Forwarded
+            to ``self.prepare_block_inputs`` on the first stage.
+        shard_config: Parallelism configuration; must not be ``None``.
+        stage_index: ``[start, end)`` layer range of this pipeline stage,
+            counted over double blocks followed by single blocks.
+        internal_img, internal_txt, internal_pe, internal_vec: Hidden states
+            received from the previous pipeline stage.
+
+    Returns:
+        A dict of ``internal_*`` tensors when this stage ends before the last
+        single block, otherwise the output of ``self.final_layer``.
+
+    Raises:
+        RuntimeError: If sequence parallelism is enabled on a stage that
+            reaches the final layer without having built the split layout.
+    """
+    txt_len = txt.shape[1]
+    # These are only produced on the first stage. Later stages used to hit an
+    # UnboundLocalError when reaching the final layer, so give them defaults.
+    cur_mask = None
+    img_splits = None
+    old_sequence_parallelism = shard_config.enable_sequence_parallelism
+    restore_sequence_parallelism = False
+    if shard_config.pipeline_stage_manager is None or shard_config.pipeline_stage_manager.is_first_stage():
+        img, txt, vec, pe = self.prepare_block_inputs(img, img_ids, txt, txt_ids, timesteps, y_vec, cond, guidance)
+        old_sequence_parallelism = shard_config.enable_sequence_parallelism
+        # Without a grad_fn the backward of switch_sequence_parallelism never
+        # runs, so the flag has to be restored at the end of this call.
+        restore_sequence_parallelism = img.grad_fn is None
+        if shard_config.enable_sequence_parallelism:
+            assert (
+                txt.shape[1] + img.shape[1]
+            ) % shard_config.sequence_parallel_size == 0, (
+                f"Expected {txt.shape[1] +img.shape[1]} % {shard_config.sequence_parallel_size} == 0"
+            )
+            # Mark image positions in the joint [txt, img] sequence, then chunk
+            # the mask to learn how many txt/img tokens each rank owns.
+            mask = torch.zeros(txt.shape[1] + img.shape[1], dtype=bool)
+            mask[txt.shape[1] :] = 1
+            mask_chunks = mask.chunk(shard_config.sequence_parallel_size)
+            cur_mask = mask_chunks[dist.get_rank(shard_config.sequence_parallel_process_group)]
+            txt_splits = [len(c) - c.sum().item() for c in mask_chunks]
+            img_splits = [c.sum().item() for c in mask_chunks]
+            if 0 in img_splits:
+                # temporarily disable sequence parallelism to avoid getting stuck
+                img = switch_sequence_parallelism(img, shard_config, False)
+            else:
+                img = split_forward_gather_backward_var_len(
+                    img, 1, shard_config.sequence_parallel_process_group, img_splits
+                )
+                txt = split_forward_gather_backward_var_len(
+                    txt, 1, shard_config.sequence_parallel_process_group, txt_splits
+                )
+                if shard_config.sequence_parallelism_mode == "ring_attn":
+                    # pe does not require grad
+                    sp_rank = dist.get_rank(shard_config.sequence_parallel_process_group)
+                    if isinstance(pe, torch.Tensor):
+                        pe = pe.chunk(shard_config.sequence_parallel_size, dim=2)[sp_rank].clone()
+                    else:
+                        cos, sin = pe
+                        cos = cos.chunk(shard_config.sequence_parallel_size, dim=1)[sp_rank].clone()
+                        sin = sin.chunk(shard_config.sequence_parallel_size, dim=1)[sp_rank].clone()
+                        pe = (cos, sin)
+    else:
+        # This stage never toggles the flag, so there is nothing to restore.
+        img, txt, vec, pe = internal_img, internal_txt, internal_vec, internal_pe
+
+    double_start, double_end = 0, len(self.double_blocks)
+    if shard_config.pipeline_stage_manager is not None:
+        double_start = stage_index[0]
+        double_end = min(stage_index[1], len(self.double_blocks))
+
+    for block in self.double_blocks[double_start:double_end]:
+        img, txt = auto_grad_checkpoint(block, img, txt, vec, pe)
+
+    # Stage ends inside (or exactly at the end of) the double blocks.
+    if shard_config.pipeline_stage_manager is not None and stage_index[1] <= len(self.double_blocks):
+        return {
+            "internal_img": img,
+            "internal_txt": txt,
+            "internal_pe": pe,
+            "internal_vec": vec,
+        }
+    single_start, single_end = 0, len(self.single_blocks)
+    if shard_config.pipeline_stage_manager is not None:
+        single_start = max(stage_index[0] - len(self.double_blocks), 0)
+        single_end = stage_index[1] - len(self.double_blocks)
+
+    # The streams are merged right before the first single block.
+    if single_start == 0:
+        img = torch.cat((txt, img), 1)
+
+    for block in self.single_blocks[single_start:single_end]:
+        img = auto_grad_checkpoint(block, img, vec, pe)
+
+    # Stage ends before the last single block.
+    if shard_config.pipeline_stage_manager is not None and single_end < len(self.single_blocks):
+        return {
+            "internal_img": img,
+            "internal_pe": pe,
+            "internal_vec": vec,
+        }
+
+    if shard_config.enable_sequence_parallelism:
+        if cur_mask is None or img_splits is None:
+            raise RuntimeError(
+                "Sequence parallelism is enabled, but the split layout is only built on the first "
+                "pipeline stage; it is unavailable on the stage that runs the final layer."
+            )
+        # Keep only the image tokens of the local chunk.
+        img = img[:, cur_mask]
+    else:
+        img = img[:, txt_len:]
+
+    img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+
+    if shard_config.enable_sequence_parallelism:
+        img = gather_forward_split_backward_var_len(img, 1, shard_config.sequence_parallel_process_group, img_splits)
+
+    if restore_sequence_parallelism:
+        shard_config.enable_sequence_parallelism = old_sequence_parallelism
+    return img
+
+
+class MMDiTPolicy(Policy):
+    def config_sanity_check(self):
+        """Assert that tensor parallelism is on whenever the configured
+        sequence-parallel mode satisfies ``is_share_sp_tp``."""
+        cfg = self.shard_config
+        if not cfg.enable_sequence_parallelism:
+            return
+        if is_share_sp_tp(cfg.sequence_parallelism_mode):
+            assert cfg.enable_tensor_parallelism, "Tensor parallelism should be enabled"
+
+    def preprocess(self) -> nn.Module:
+        """Return ``self.model`` unchanged; no preprocessing is performed."""
+        return self.model
+
+    def postprocess(self) -> nn.Module:
+        """Return ``self.model`` unchanged; no postprocessing is performed."""
+        return self.model
+
+    def tie_weight_check(self) -> bool:
+        """Always return ``False``."""
+        return False
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        policy = {
+            DoubleStreamBlock: ModulePolicyDescription(attribute_replacement={}, sub_module_replacement=[]),
+            SingleStreamBlock: ModulePolicyDescription(attribute_replacement={}, sub_module_replacement=[]),
+        }
+
+        if self.shard_config.enable_sequence_parallelism:
+            if not is_share_sp_tp(self.shard_config.sequence_parallelism_mode):
+                policy[DoubleStreamBlock].attribute_replacement["processor"] = DistributedDoubleStreamBlockProcessor(
+                    self.shard_config
+                )
+                policy[SingleStreamBlock].attribute_replacement["processor"] = DistributedSingleStreamBlockProcessor(
+                    self.shard_config
+                )
+        if self.shard_config.enable_sequence_parallelism or self.shard_config.pipeline_stage_manager is not None:
+            fwd_fn = partial(mmdit_model_forward, shard_config=self.shard_config)
+            if self.shard_config.pipeline_stage_manager is not None:
+                layers_per_stage = self.shard_config.pipeline_stage_manager.distribute_layers(
+                    len(self.model.double_blocks) + len(self.model.single_blocks)
+                )
+                if self.shard_config.pipeline_stage_manager.is_interleave:
+                    self.shard_config.pipeline_stage_manager.stage_indices = (
+                        self.shard_config.pipeline_stage_manager.get_stage_index(layers_per_stage)
+                    )
+                else:
+                    stage_index = self.shard_config.pipeline_stage_manager.get_stage_index(layers_per_stage)
+                    fwd_fn = partial(mmdit_model_forward, shard_config=self.shard_config, stage_index=stage_index)
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": fwd_fn,
+                },
+                policy=policy,
+                target_key=MMDiTModel,
+            )
+
+        if self.shard_config.enable_tensor_parallelism:
+            mlp_hidden_size = int(self.model.config.hidden_size * self.model.config.mlp_ratio)
+            assert (
+                self.model.config.num_heads % self.shard_config.tensor_parallel_size == 0
+                and mlp_hidden_size % self.shard_config.tensor_parallel_size == 0
+            ), "num_heads and hidden_size should be divisible by tensor_parallel_size"
+            for n in ["img", "txt"]:
+                if self.model.config.fused_qkv:
+                    policy[DoubleStreamBlock].sub_module_replacement.append(
+                        SubModuleReplacementDescription(
+                            suffix=f"{n}_attn.qkv",
+                            target_module=FusedLinear1D_Col,
+                            kwargs={
+                                "split_sizes": [self.model.config.hidden_size] * 3,
+                                "seq_parallel_mode": self.shard_config.sequence_parallelism_mode,
+                            },
+                        ),
+                    )
+                else:
+                    policy[DoubleStreamBlock].sub_module_replacement.extend(
+                        [
+                            SubModuleReplacementDescription(
+                                suffix=f"{n}_attn.q_proj",
+                                target_module=Linear1D_Col,
+                                kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                            ),
+                            SubModuleReplacementDescription(
+                                suffix=f"{n}_attn.k_proj",
+                                target_module=Linear1D_Col,
+                                kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                            ),
+                            SubModuleReplacementDescription(
+                                suffix=f"{n}_attn.v_proj",
+                                target_module=Linear1D_Col,
+                                kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                            ),
+                        ]
+                    )
+                policy[DoubleStreamBlock].sub_module_replacement.extend(
+                    [
+                        SubModuleReplacementDescription(
+                            suffix=f"{n}_attn.proj",
+                            target_module=Linear1D_Row,
+                            kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix=f"{n}_mlp[0]",
+                            target_module=Linear1D_Col,
+                            kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix=f"{n}_mlp[2]",
+                            target_module=Linear1D_Row,
+                            kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                        ),
+                    ]
+                )
+            policy[DoubleStreamBlock].attribute_replacement["num_heads"] = (
+                self.model.config.num_heads // self.shard_config.tensor_parallel_size
+            )
+            policy[SingleStreamBlock].attribute_replacement.update(
+                {
+                    "num_heads": self.model.config.num_heads // self.shard_config.tensor_parallel_size,
+                    "hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                    "mlp_hidden_dim": mlp_hidden_size // self.shard_config.tensor_parallel_size,
+                }
+            )
+            if self.model.config.fused_qkv:
+                policy[SingleStreamBlock].sub_module_replacement.append(
+                    SubModuleReplacementDescription(
+                        suffix="linear1",
+                        target_module=FusedLinear1D_Col,
+                        kwargs={
+                            "split_sizes": [self.model.config.hidden_size] * 3 + [mlp_hidden_size],
+                            "seq_parallel_mode": self.shard_config.sequence_parallelism_mode,
+                        },
+                    ),
+                )
+            else:
+                policy[SingleStreamBlock].sub_module_replacement.extend(
+                    [
+                        SubModuleReplacementDescription(
+                            suffix="q_proj",
+                            target_module=Linear1D_Col,
+                            kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="k_proj",
+                            target_module=Linear1D_Col,
+                            kwargs={"seq_parallel_mode": self.shard_config.sequence_parallelism_mode},
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="v_mlp",
+                            target_module=FusedLinear1D_Col,
+                            kwargs={
+                                "split_sizes": [self.model.config.hidden_size] + [mlp_hidden_size],
+                                "seq_parallel_mode": self.shard_config.sequence_parallelism_mode,
+                            },
+                        ),
+                    ]
+                )
+            policy[SingleStreamBlock].sub_module_replacement.extend(
+                [
+                    SubModuleReplacementDescription(
+                        suffix="linear2",
+                        target_module=FusedLinear1D_Row,
+                        kwargs={
+                            "split_sizes": [self.model.config.hidden_size, mlp_hidden_size],
+                            "seq_parallel_mode": self.shard_config.sequence_parallelism_mode,
+                        },
+                    ),
+                ],
+            )
+
+        return policy
+
+    def get_held_layers(self) -> List[nn.Module]:
+        """Return the modules this pipeline stage owns.
+
+        First stage: the input embedders; every stage: its range(s) of blocks; last stage: ``final_layer``.
+        """
+        manager = self.shard_config.pipeline_stage_manager
+        assert manager is not None, "Pipeline stage manager is not set"
+
+        # double blocks first, then single blocks, addressed through one flat index space
+        blocks = [*self.model.double_blocks, *self.model.single_blocks]
+        owned = []
+        if manager.is_first_stage(ignore_chunk=manager.is_interleave):
+            model = self.model
+            owned += [model.pe_embedder, model.img_in, model.time_in, model.vector_in]
+            owned += [model.guidance_in, model.cond_in, model.txt_in]
+
+        layers_per_stage = manager.distribute_layers(len(blocks))
+        if manager.is_interleave:
+            # interleaved schedule: get_stage_index yields several (start, end) ranges
+            assert manager.num_model_chunks is not None
+            ranges = manager.get_stage_index(layers_per_stage)
+        else:
+            # plain schedule: a single (start, end) pair, wrapped so both cases share the loop below
+            ranges = [manager.get_stage_index(layers_per_stage)]
+        for start, stop in ranges:
+            owned.extend(blocks[start:stop])
+
+        # the output head is appended only when this is the last stage
+        if manager.is_last_stage(ignore_chunk=manager.is_interleave):
+            owned.append(self.model.final_layer)
+        return owned
diff --git a/opensora/models/mmdit/layers.py b/opensora/models/mmdit/layers.py
new file mode 100644
index 0000000..7fa8d3d
--- /dev/null
+++ b/opensora/models/mmdit/layers.py
@@ -0,0 +1,402 @@
+# Modified from Flux
+#
+# Copyright 2024 Black Forest Labs
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+
+import torch
+from einops import rearrange
+from liger_kernel.ops.rms_norm import LigerRMSNormFunction
+from torch import Tensor, nn
+
+from .math import attention, liger_rope, rope
+
+
+class EmbedND(nn.Module):
+    """Rotary position table for N-D ids: one ``rope`` table per id axis, concatenated on dim -3."""
+
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        # plain attributes only; nothing is registered as a parameter or buffer
+        self.dim, self.theta, self.axes_dim = dim, theta, axes_dim
+
+    def forward(self, ids: Tensor) -> Tensor:
+        """Build the table for ``ids`` (its last axis indexes the position axes) and insert a size-1 axis at dim 1."""
+        # axes_dim[axis] is the channel count handed to rope for that axis
+        per_axis = [rope(ids[..., axis], self.axes_dim[axis], self.theta) for axis in range(ids.shape[-1])]
+        stacked = torch.cat(per_axis, dim=-3)
+        return stacked.unsqueeze(1)
+
+
+class LigerEmbedND(nn.Module):
+    """Variant of the N-D rotary embedder that returns separate cos/sin tables built with ``liger_rope``."""
+
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        # plain attributes only; nothing is registered as a parameter or buffer
+        self.dim, self.theta, self.axes_dim = dim, theta, axes_dim
+
+    def forward(self, ids: Tensor) -> tuple[Tensor, Tensor]:
+        """Return ``(cos, sin)``: per-axis tables joined on the last dim, then tiled twice along it."""
+        # one (cos, sin) pair per position axis found in the trailing dim of ids
+        pairs = [liger_rope(ids[..., axis], self.axes_dim[axis], self.theta) for axis in range(ids.shape[-1])]
+        cos_parts = [cos for cos, _ in pairs]
+        sin_parts = [sin for _, sin in pairs]
+        # repeat(1, 1, 2) duplicates the concatenated table along its last dim (the tables are 3-D here)
+        cos_emb = torch.cat(cos_parts, dim=-1).repeat(1, 1, 2).contiguous()
+        sin_emb = torch.cat(sin_parts, dim=-1).repeat(1, 1, 2).contiguous()
+
+        return (cos_emb, sin_emb)
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
+def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
+    """
+    Create sinusoidal timestep embeddings.
+    :param t: a 1-D Tensor of N indices, one per batch element; these may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :param time_factor: multiplier applied to t before the sinusoids are evaluated.
+    :return: an (N, D) Tensor of positional embeddings.
+    """
+    t = time_factor * t
+    half = dim // 2  # width of the cos part and of the sin part
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device)
+
+    args = t[:, None].float() * freqs[None]  # (N, half): every timestep times every frequency
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)  # cos block first, then sin block
+    if dim % 2:  # odd dim: append one zero column so the width equals dim
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    if torch.is_floating_point(t):  # float input: match t's dtype/device; otherwise the float32 result is kept
+        embedding = embedding.to(t)
+    return embedding
+
+
+class MLPEmbedder(nn.Module):  # two-layer MLP: Linear(in_dim -> hidden_dim), SiLU, Linear(hidden_dim -> hidden_dim)
+    def __init__(self, in_dim: int, hidden_dim: int):
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)  # output keeps the hidden width
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))  # in_layer -> SiLU -> out_layer
+
+
+class RMSNorm(torch.nn.Module):  # RMS normalisation over the last dim with a learnable per-channel scale
+    def __init__(self, dim: int):
+        super().__init__()
+        self.scale = nn.Parameter(torch.ones(dim))  # starts at 1, i.e. pure normalisation
+
+    def forward(self, x: Tensor):
+        x_dtype = x.dtype  # remembered so the normalised tensor can be cast back
+        x = x.float()  # statistics are computed in float32
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)  # 1 / sqrt(mean(x^2) + eps)
+        return (x * rrms).to(dtype=x_dtype) * self.scale  # scale is applied after the cast back
+
+
+class FusedRMSNorm(RMSNorm):  # reuses RMSNorm's `scale` parameter; forward goes through LigerRMSNormFunction
+    def forward(self, x: Tensor):
+        return LigerRMSNormFunction.apply(
+            x,
+            self.scale,
+            1e-6,  # same constant RMSNorm.forward adds under the square root
+            0.0,  # presumably the weight offset — confirm against the installed liger_kernel signature
+            "llama",  # presumably the casting mode — confirm
+            False,  # presumably in_place — confirm
+        )
+
+
+class QKNorm(torch.nn.Module):  # separate norms for queries and keys; v is only used as the cast target
+    def __init__(self, dim: int):
+        super().__init__()
+        self.query_norm = FusedRMSNorm(dim)
+        self.key_norm = FusedRMSNorm(dim)
+
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
+        q = self.query_norm(q)
+        k = self.key_norm(k)
+        return q.to(v), k.to(v)  # Tensor.to(other): match v's dtype and device
+
+
+class SelfAttention(nn.Module):  # q/k/v projection, QK-norm, rotary attention, then output projection
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, fused_qkv: bool = True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.fused_qkv = fused_qkv
+        head_dim = dim // num_heads
+
+        if fused_qkv:
+            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)  # one projection emitting q, k and v together
+        else:
+            self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+            self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
+            self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)
+        self.norm = QKNorm(head_dim)  # sized for a single head's channels
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
+        if self.fused_qkv:
+            qkv = self.qkv(x)
+            q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)  # heads-first layout
+        else:
+            q = rearrange(self.q_proj(x), "B L (H D) -> B L H D", H=self.num_heads)  # sequence-first until after the norm
+            k = rearrange(self.k_proj(x), "B L (H D) -> B L H D", H=self.num_heads)
+            v = rearrange(self.v_proj(x), "B L (H D) -> B L H D", H=self.num_heads)
+        q, k = self.norm(q, k, v)  # D is the trailing axis in both layouts
+        if not self.fused_qkv:
+            q = rearrange(q, "B L H D -> B H L D")  # bring the unfused branch to the heads-first layout
+            k = rearrange(k, "B L H D -> B H L D")
+            v = rearrange(v, "B L H D -> B H L D")
+        x = attention(q, k, v, pe=pe)  # applies the rotary embedding and merges heads to "B L (H D)"
+        x = self.proj(x)
+        return x
+
+
+@dataclass
+class ModulationOut:  # one (shift, scale, gate) triple produced by Modulation.forward
+    shift: Tensor  # the block processors add it after scaling the normalised input
+    scale: Tensor  # the block processors apply it as (1 + scale)
+    gate: Tensor  # the block processors multiply the branch output by it before the residual add
+
+
+class Modulation(nn.Module):  # maps the conditioning vector to one (single) or two (double) shift/scale/gate triples
+    def __init__(self, dim: int, double: bool):
+        super().__init__()
+        self.is_double = double
+        self.multiplier = 6 if double else 3  # three chunks per ModulationOut
+        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
+
+    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
+
+        return (  # [:, None, :] above inserted a size-1 middle axis; chunks are dim-wide slices of the last axis
+            ModulationOut(*out[:3]),  # always present
+            ModulationOut(*out[3:]) if self.is_double else None,  # second triple only when double=True
+        )
+
+
+class DoubleStreamBlockProcessor:
+    def __call__(self, attn: nn.Module, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
+        # attn is the DoubleStreamBlock that owns all sub-modules used below;
+        # img and txt keep separate weights, while both are modulated by the same vec
+
+        # vec yields two (shift, scale, gate) triples per stream: one for attention, one for the MLP
+        img_mod1, img_mod2 = attn.img_mod(vec)  # get shift, scale, gate for each mod
+        txt_mod1, txt_mod2 = attn.txt_mod(vec)
+
+        # prepare image for attention
+        img_modulated = attn.img_norm1(img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+
+        if attn.img_attn.fused_qkv:
+            img_qkv = attn.img_attn.qkv(img_modulated)
+            img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
+        else:
+            img_q = rearrange(attn.img_attn.q_proj(img_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+            img_k = rearrange(attn.img_attn.k_proj(img_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+            img_v = rearrange(attn.img_attn.v_proj(img_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+
+        img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)  # RMSNorm for QK Norm as in SD3 paper
+        if not attn.img_attn.fused_qkv:  # unfused branch is still "B L H D": move heads in front of the sequence
+            img_q = rearrange(img_q, "B L H D -> B H L D")
+            img_k = rearrange(img_k, "B L H D -> B H L D")
+            img_v = rearrange(img_v, "B L H D -> B H L D")
+
+        # prepare txt for attention
+        txt_modulated = attn.txt_norm1(txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        if attn.txt_attn.fused_qkv:
+            txt_qkv = attn.txt_attn.qkv(txt_modulated)
+            txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
+        else:
+            txt_q = rearrange(attn.txt_attn.q_proj(txt_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+            txt_k = rearrange(attn.txt_attn.k_proj(txt_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+            txt_v = rearrange(attn.txt_attn.v_proj(txt_modulated), "B L (H D) -> B L H D", H=attn.num_heads)
+        txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
+        if not attn.txt_attn.fused_qkv:
+            txt_q = rearrange(txt_q, "B L H D -> B H L D")
+            txt_k = rearrange(txt_k, "B L H D -> B H L D")
+            txt_v = rearrange(txt_v, "B L H D -> B H L D")
+
+        # run joint attention: text and image tokens are concatenated along the sequence axis (dim=2 of "B H L D"), text first
+        q = torch.cat((txt_q, img_q), dim=2)
+        k = torch.cat((txt_k, img_k), dim=2)
+        v = torch.cat((txt_v, img_v), dim=2)
+
+        attn1 = attention(q, k, v, pe=pe)  # "B L (H D)"; split back below at the text length txt_q.shape[2]
+        txt_attn, img_attn = attn1[:, : txt_q.shape[2]], attn1[:, txt_q.shape[2] :]
+
+        # image stream: gated attention residual, then gated MLP residual
+        img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
+
+        # text stream: gated attention residual, then gated MLP residual
+        txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
+        txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
+        return img, txt
+
+
+class DoubleStreamBlock(nn.Module):  # holds separate image/text weights; the computation lives in the processor
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float,
+        qkv_bias: bool = False,
+        fused_qkv: bool = True,
+    ):
+        super().__init__()
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.num_heads = num_heads  # read by the processor when splitting heads
+        self.hidden_size = hidden_size
+        self.head_dim = hidden_size // num_heads
+
+        # image stream
+        self.img_mod = Modulation(hidden_size, double=True)  # double=True: one triple for attention, one for the MLP
+        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, fused_qkv=fused_qkv)
+
+        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.img_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
+            nn.GELU(approximate="tanh"),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
+        )
+
+        # text stream (same layout as the image stream, separate parameters)
+        self.txt_mod = Modulation(hidden_size, double=True)
+        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, fused_qkv=fused_qkv)
+
+        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
+            nn.GELU(approximate="tanh"),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
+        )
+
+        # processor: the callable that implements forward; replaceable through set_processor
+        processor = DoubleStreamBlockProcessor()
+        self.set_processor(processor)
+
+    def set_processor(self, processor) -> None:
+        self.processor = processor
+
+    def get_processor(self):
+        return self.processor
+
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> tuple[Tensor, Tensor]:
+        return self.processor(self, img, txt, vec, pe)  # extra **kwargs are accepted but not forwarded
+
+
+class SingleStreamBlockProcessor:
+    def __call__(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
+        mod, _ = attn.modulation(vec)  # single-mode Modulation: the second element is None
+        x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
+        if attn.fused_qkv:  # linear1 emits [q | k | v | mlp_in] in one matmul
+            qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
+            q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
+        else:  # q and k have their own projections; v shares one with the MLP input
+            q = rearrange(attn.q_proj(x_mod), "B L (H D) -> B L H D", H=attn.num_heads)
+            k = rearrange(attn.k_proj(x_mod), "B L (H D) -> B L H D", H=attn.num_heads)
+            v, mlp = torch.split(attn.v_mlp(x_mod), [attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
+            v = rearrange(v, "B L (H D) -> B L H D", H=attn.num_heads)
+
+        q, k = attn.norm(q, k, v)
+        if not attn.fused_qkv:  # unfused branch is still "B L H D": move heads in front of the sequence
+            q = rearrange(q, "B L H D -> B H L D")
+            k = rearrange(k, "B L H D -> B H L D")
+            v = rearrange(v, "B L H D -> B H L D")
+
+        # compute attention
+        attn_1 = attention(q, k, v, pe=pe)
+
+        # compute activation in mlp stream, cat again (on dim 2, the feature axis) and run second linear layer
+        output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
+        output = x + mod.gate * output  # gated residual
+        return output
+
+
+class SingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qk_scale: float | None = None,
+        fused_qkv: bool = True,
+    ):
+        super().__init__()
+        self.hidden_dim = hidden_size  # same value as self.hidden_size set further down
+        self.num_heads = num_heads
+        self.head_dim = hidden_size // num_heads
+        self.scale = qk_scale or self.head_dim**-0.5  # not read by SingleStreamBlockProcessor in this file
+        self.fused_qkv = fused_qkv
+
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        if fused_qkv:
+            # qkv and mlp_in
+            self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
+        else:
+            self.q_proj = nn.Linear(hidden_size, hidden_size)
+            self.k_proj = nn.Linear(hidden_size, hidden_size)
+            self.v_mlp = nn.Linear(hidden_size, hidden_size + self.mlp_hidden_dim)  # v and mlp_in share one projection
+
+        # proj and mlp_out
+        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
+
+        self.norm = QKNorm(self.head_dim)
+
+        self.hidden_size = hidden_size  # the processor uses this as the split size for q/k/v
+        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+
+        self.mlp_act = nn.GELU(approximate="tanh")
+        self.modulation = Modulation(hidden_size, double=False)  # a single (shift, scale, gate) triple
+
+        processor = SingleStreamBlockProcessor()  # default implementation of forward; replaceable via set_processor
+        self.set_processor(processor)
+
+    def set_processor(self, processor) -> None:
+        self.processor = processor
+
+    def get_processor(self):
+        return self.processor
+
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, **kwargs) -> Tensor:
+        return self.processor(self, x, vec, pe)  # extra **kwargs are accepted but not forwarded
+
+
+class LastLayer(nn.Module):  # modulated LayerNorm followed by a linear projection to patch_size^2 * out_channels
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
+
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)  # first half is shift, second half is scale
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]  # [:, None, :] broadcasts over x's middle axis
+        x = self.linear(x)
+        return x
diff --git a/opensora/models/mmdit/math.py b/opensora/models/mmdit/math.py
new file mode 100644
index 0000000..f09cd97
--- /dev/null
+++ b/opensora/models/mmdit/math.py
@@ -0,0 +1,117 @@
+import torch
+from einops import rearrange
+from flash_attn import flash_attn_func as flash_attn_func_v2
+from liger_kernel.ops.rope import LigerRopeFunction
+from torch import Tensor
+from typing import Tuple
+
+try:
+    from flash_attn_interface import flash_attn_func as flash_attn_func_v3
+
+    SUPPORT_FA3 = True
+except Exception:  # optional FA3 backend: any import/load failure falls back to FA2; Ctrl-C and SystemExit still propagate
+    SUPPORT_FA3 = False
+
+
+def flash_attn_func(q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+    """Run FlashAttention-3 when it was importable (keeping element 0 of its result), else FlashAttention-2."""
+    # conditional expression: only the selected backend is evaluated, as with the original if/return pair
+    return flash_attn_func_v3(q, k, v)[0] if SUPPORT_FA3 else flash_attn_func_v2(q, k, v)
+
+
+def attention(q: Tensor, k: Tensor, v: Tensor, pe) -> Tensor:
+    """Rotate q/k by ``pe``, run flash attention on "B H L D" inputs and return the heads merged as "B L (H D)"."""
+    if not isinstance(pe, torch.Tensor):
+        # non-tensor pe is unpacked as a (cos, sin) pair for the Liger rotary function
+        cos, sin = pe
+        q, k = LigerRopeFunction.apply(q, k, cos, sin)
+        # debugging aid kept from the original: `k = reverse_rearrange_tensor(k)` to compare with the original implementation
+    else:
+        q, k = apply_rope(q, k, pe)
+    # the flash-attention wrapper is fed "B L H D", so move the sequence axis in front of the heads
+    q, k, v = (rearrange(t, "B H L D -> B L H D") for t in (q, k, v))
+    out = flash_attn_func(q, k, v)
+    out = rearrange(out, "B L H D -> B L (H D)")
+
+    return out
+
+
+def liger_rope(pos: Tensor, dim: int, theta: int) -> Tuple:
+    """Return ``(cos, sin)`` of ``pos[..., n] * theta ** (-2i / dim)`` for ``i`` in ``range(dim // 2)``."""
+    assert dim % 2 == 0
+    exponents = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
+    inv_freq = 1.0 / (theta**exponents)
+    # outer product over the trailing axis: (..., n) x (dim // 2,) -> (..., n, dim // 2)
+    angles = torch.einsum("...n,d->...nd", pos, inv_freq)
+
+    return (angles.cos(), angles.sin())
+
+
+def rope(pos: Tensor, dim: int, theta: int) -> Tensor:  # one 2x2 rotation matrix per (position, frequency) pair
+    assert dim % 2 == 0  # one frequency per pair of channels
+    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim  # angle table built in float64
+    omega = 1.0 / (theta**scale)
+    out = torch.einsum("...n,d->...nd", pos, omega)  # angle = position * frequency
+    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)  # [cos, -sin, sin, cos]
+    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)  # flat 4-vector -> 2x2 matrix
+    return out.float()  # a single float32 tensor is returned, not a tuple
+
+
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
+    """Rotate ``xq``/``xk`` by ``freqs_cis``: inputs are upcast with ``.float()`` and results cast back via ``type_as``."""
+    col0, col1 = freqs_cis[..., 0], freqs_cis[..., 1]
+    q_pairs, k_pairs = (t.float().reshape(*t.shape[:-1], -1, 1, 2) for t in (xq, xk))  # last dim -> (D/2, 1, 2)
+    q_rot, k_rot = (col0 * pairs[..., 0] + col1 * pairs[..., 1] for pairs in (q_pairs, k_pairs))
+    return q_rot.reshape(*xq.shape).type_as(xq), k_rot.reshape(*xk.shape).type_as(xk)
+
+
+def rearrange_tensor(tensor):
+    """
+    De-interleave the last dimension (D) of the input tensor: source index 2d lands at d and
+    source index 2d+1 lands at D/2 + d, so even positions come first and odd positions follow.
+
+    Args:
+        tensor (torch.Tensor): Input tensor of shape [B, H, L, D], where D is even.
+
+    Returns:
+        torch.Tensor: Tensor with rearranged last dimension, same shape as input.
+
+    Raises:
+        ValueError: If D is odd (a non-4-D input also fails here, while unpacking the shape).
+    """
+    _, _, _, D = tensor.shape
+    if D % 2:
+        raise ValueError("The last dimension D must be even.")
+
+    # gather order: every even source position, then every odd one
+    even_src = torch.arange(0, D, 2, device=tensor.device)
+    odd_src = torch.arange(1, D, 2, device=tensor.device)
+    order = torch.cat((even_src, odd_src))
+
+    return tensor.index_select(dim=-1, index=order)
+
+
+def reverse_rearrange_tensor(tensor):
+    """
+    Undo ``rearrange_tensor``: re-interleave the two halves of the last dimension (D), so source
+    index d lands at 2d and source index D/2 + d lands at 2d + 1.
+
+    Args:
+        tensor (torch.Tensor): Input tensor of shape [B, H, L, D], where D is even.
+
+    Returns:
+        torch.Tensor: Tensor with restored original last dimension order, same shape as input.
+
+    Raises:
+        ValueError: If D is odd (a non-4-D input also fails here, while unpacking the shape).
+    """
+    _, _, _, D = tensor.shape
+    if D % 2:
+        raise ValueError("The last dimension D must be even.")
+
+    # pair the i-th entry of the first half with the i-th of the second half, then flatten the pairs
+    first_half = torch.arange(D // 2, device=tensor.device)
+    second_half = torch.arange(D // 2, D, device=tensor.device)
+    order = torch.stack((first_half, second_half), dim=-1).flatten()
+
+    return tensor.index_select(dim=-1, index=order)
diff --git a/opensora/models/mmdit/model.py b/opensora/models/mmdit/model.py
new file mode 100644
index 0000000..29c8122
--- /dev/null
+++ b/opensora/models/mmdit/model.py
@@ -0,0 +1,303 @@
+# Modified from Flux
+#
+# Copyright 2024 Black Forest Labs
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+
+import torch
+from torch import Tensor, nn
+
+from opensora.acceleration.checkpoint import auto_grad_checkpoint
+from opensora.models.mmdit.layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    LigerEmbedND,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+)
+from opensora.registry import MODELS
+from opensora.utils.ckpt import load_checkpoint
+
+
+@dataclass
+class MMDiTConfig:
+    model_type = "MMDiT"  # un-annotated, so a plain class attribute rather than a dataclass field
+    from_pretrained: str | None  # checkpoint to load; the Flux factory may pass None
+    cache_dir: str | None  # forwarded to load_checkpoint by the Flux factory
+    in_channels: int  # input width of img_in; MMDiTModel reuses it as out_channels
+    vec_in_dim: int  # input width of vector_in (applied to y_vec)
+    context_in_dim: int  # input width of txt_in (applied to txt)
+    hidden_size: int  # must be divisible by num_heads
+    mlp_ratio: float  # passed through to every Double/SingleStreamBlock
+    num_heads: int
+    depth: int  # number of DoubleStreamBlocks
+    depth_single_blocks: int  # number of SingleStreamBlocks
+    axes_dim: list[int]  # must sum to hidden_size // num_heads
+    theta: int  # passed through to the positional embedder
+    qkv_bias: bool  # passed to DoubleStreamBlock only
+    guidance_embed: bool  # when True the model requires a `guidance` tensor
+    cond_embed: bool = False  # when True the model requires a `cond` tensor
+    fused_qkv: bool = True
+    grad_ckpt_settings: tuple[int, int] | None = None  # (double, single): leading blocks that get checkpointed
+    use_liger_rope: bool = False  # selects LigerEmbedND instead of EmbedND
+    patch_size: int = 2
+
+    def get(self, attribute_name: str, default=None):  # dict-style lookup with a fallback
+        return getattr(self, attribute_name, default)
+
+    def __contains__(self, attribute_name: str) -> bool:  # enables `"name" in config`
+        return hasattr(self, attribute_name)
+
+
+class MMDiTModel(nn.Module):
+    config_class = MMDiTConfig
+
+    def __init__(self, config: MMDiTConfig):
+        super().__init__()
+
+        self.config = config
+        self.in_channels = config.in_channels
+        self.out_channels = self.in_channels  # output width mirrors the input width
+        self.patch_size = config.patch_size
+
+        if config.hidden_size % config.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {config.hidden_size} must be divisible by num_heads {config.num_heads}"
+            )
+
+        pe_dim = config.hidden_size // config.num_heads  # per-head width; axes_dim must add up to it
+        if sum(config.axes_dim) != pe_dim:
+            raise ValueError(
+                f"Got {config.axes_dim} but expected positional dim {pe_dim}"
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_heads
+        pe_embedder_cls = LigerEmbedND if config.use_liger_rope else EmbedND
+        self.pe_embedder = pe_embedder_cls(
+            dim=pe_dim, theta=config.theta, axes_dim=config.axes_dim
+        )
+
+        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)  # 256 = timestep_embedding width used below
+        self.vector_in = MLPEmbedder(config.vec_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
+            if config.guidance_embed
+            else nn.Identity()  # placeholder; never called because the guidance branch is gated on the same flag
+        )
+        self.cond_in = (
+            nn.Linear(
+                self.in_channels + self.patch_size**2, self.hidden_size, bias=True
+            )
+            if config.cond_embed
+            else nn.Identity()  # placeholder; never called because the cond branch is gated on the same flag
+        )
+        self.txt_in = nn.Linear(config.context_in_dim, self.hidden_size)
+
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=config.mlp_ratio,
+                    qkv_bias=config.qkv_bias,
+                    fused_qkv=config.fused_qkv,
+                )
+                for _ in range(config.depth)
+            ]
+        )
+
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=config.mlp_ratio,
+                    fused_qkv=config.fused_qkv,
+                )
+                for _ in range(config.depth_single_blocks)
+            ]
+        )
+
+        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)  # NOTE(review): the literal 1 is presumably a patch size - confirm
+        self.initialize_weights()
+
+        if self.config.grad_ckpt_settings:  # forward is bound per instance, chosen once at construction
+            self.forward = self.forward_selective_ckpt
+        else:
+            self.forward = self.forward_ckpt
+        self._input_requires_grad = False  # flipped by enable_input_require_grads()
+
+    def initialize_weights(self):
+        if self.config.cond_embed:  # zero weight and bias: cond_in(cond) starts out as an all-zero addend
+            nn.init.zeros_(self.cond_in.weight)
+            nn.init.zeros_(self.cond_in.bias)
+
+    def prepare_block_inputs(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,  # t5 encoded vec
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y_vec: Tensor,  # clip encoded vec
+        cond: Tensor | None = None,
+        guidance: Tensor | None = None,
+    ):
+        """
+        Project the raw inputs into block inputs. Returns (img, txt, vec, pe):
+            img: img_in(img), plus cond_in(cond) when config.cond_embed is set,
+            txt: txt_in(txt), the projected text context (from t5),
+            vec: time embedding + optional guidance embedding + vector_in(y_vec),
+            pe: the positional embeddings for concatenated txt and img ids
+        """
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")
+
+        # running on sequences img
+        img = self.img_in(img)
+        if self.config.cond_embed:
+            if cond is None:
+                raise ValueError("Didn't get conditional input for conditional model.")
+            img = img + self.cond_in(cond)
+
+        vec = self.time_in(timestep_embedding(timesteps, 256))
+        if self.config.guidance_embed:
+            if guidance is None:
+                raise ValueError(
+                    "Didn't get guidance strength for guidance distilled model."
+                )
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+        vec = vec + self.vector_in(y_vec)
+
+        txt = self.txt_in(txt)
+
+        # concat: 4096 + t*h*2/4  (NOTE(review): presumably txt length + img length; text ids come first)
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)
+
+        if self._input_requires_grad:
+            # we only apply lora to double/single blocks, thus we only need to enable grad for these inputs
+            img.requires_grad_()
+            txt.requires_grad_()
+
+        return img, txt, vec, pe
+
+    def enable_input_require_grads(self):
+        """Fit peft lora. This method should not be called manually."""
+        self._input_requires_grad = True  # read by prepare_block_inputs on every forward pass
+
+    def forward_ckpt(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y_vec: Tensor,
+        cond: Tensor | None = None,
+        guidance: Tensor | None = None,
+        **kwargs,  # accepted but not used by this method
+    ) -> Tensor:
+        img, txt, vec, pe = self.prepare_block_inputs(
+            img, img_ids, txt, txt_ids, timesteps, y_vec, cond, guidance
+        )
+
+        for block in self.double_blocks:  # every block goes through auto_grad_checkpoint
+            img, txt = auto_grad_checkpoint(block, img, txt, vec, pe)
+
+        img = torch.cat((txt, img), 1)  # single blocks run on the joint sequence, text tokens first
+        for block in self.single_blocks:
+            img = auto_grad_checkpoint(block, img, vec, pe)
+        img = img[:, txt.shape[1] :, ...]  # drop the leading text tokens again
+
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        return img
+
+    def forward_selective_ckpt(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y_vec: Tensor,
+        cond: Tensor | None = None,
+        guidance: Tensor | None = None,
+        **kwargs,  # accepted but not used by this method
+    ) -> Tensor:
+        img, txt, vec, pe = self.prepare_block_inputs(
+            img, img_ids, txt, txt_ids, timesteps, y_vec, cond, guidance
+        )
+
+        ckpt_depth_double = self.config.grad_ckpt_settings[0]  # leading double blocks to checkpoint
+        for block in self.double_blocks[:ckpt_depth_double]:
+            img, txt = auto_grad_checkpoint(block, img, txt, vec, pe)
+
+        for block in self.double_blocks[ckpt_depth_double:]:  # the remainder is called directly
+            img, txt = block(img, txt, vec, pe)
+
+        ckpt_depth_single = self.config.grad_ckpt_settings[1]  # leading single blocks to checkpoint
+        img = torch.cat((txt, img), 1)  # single blocks run on the joint sequence, text tokens first
+        for block in self.single_blocks[:ckpt_depth_single]:
+            img = auto_grad_checkpoint(block, img, vec, pe)
+        for block in self.single_blocks[ckpt_depth_single:]:
+            img = block(img, vec, pe)
+
+        img = img[:, txt.shape[1] :, ...]  # drop the leading text tokens again
+
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        return img
+
+
+@MODELS.register_module("flux")
+def Flux(
+    cache_dir: str = None,
+    from_pretrained: str = None,
+    device_map: str | torch.device = "cuda",
+    torch_dtype: torch.dtype = torch.bfloat16,
+    strict_load: bool = False,
+    **kwargs,
+) -> MMDiTModel:
+    """Build an MMDiTModel on `device_map` in `torch_dtype`, loading `from_pretrained` when it is given.
+
+    All remaining keyword arguments are forwarded to MMDiTConfig.
+    """
+
+    config = MMDiTConfig(from_pretrained=from_pretrained, cache_dir=cache_dir, **kwargs)
+    # With a checkpoint to load, parameters are created directly in torch_dtype instead of being cast afterwards.
+    low_precision_init = bool(from_pretrained)
+    default_dtype = torch.get_default_dtype()
+    try:
+        if low_precision_init:
+            torch.set_default_dtype(torch_dtype)
+        with torch.device(device_map):
+            model = MMDiTModel(config)
+    finally:
+        # The default dtype is process-global state: restore it even when construction raises.
+        torch.set_default_dtype(default_dtype)
+    if not low_precision_init:
+        model = model.to(torch_dtype)
+    if from_pretrained:
+        model = load_checkpoint(
+            model, from_pretrained, cache_dir=cache_dir, device_map=device_map, strict=strict_load
+        )
+    return model
diff --git a/opensora/models/mmdit/policy.py b/opensora/models/mmdit/policy.py
new file mode 100644
index 0000000..bfaf8e4
--- /dev/null
+++ b/opensora/models/mmdit/policy.py
@@ -0,0 +1,155 @@
+from functools import partial
+from typing import Dict, Union
+
+import torch.nn as nn
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+from opensora.models.vae.tensor_parallel import Conv3dTPCol, Conv3dTPRow, GroupNormTP
+
+from .distributed import ContextParallelAttention, TPUpDecoderBlockCausal3D, prepare_parallel_attention_mask
+from .vae import DecoderCausal3D, EncoderCausal3D
+
+
+def gen_resnets_replacements(prefix: str, with_shortcut: bool = False):
+    """
+    List the sub-module replacements for the resnet block located at ``prefix``.
+
+    Both norms map to GroupNormTP and both convs to Conv3dTPRow with
+    ``split_output=True``; the shortcut conv gets the same treatment on request.
+
+    Args:
+        prefix: module path of the resnet block, e.g. ``down_blocks[0].resnets[0]``.
+        with_shortcut: also replace ``{prefix}.conv_shortcut.conv``.
+
+    Returns:
+        The replacement descriptions in the order norm1, conv1, norm2, conv2,
+        followed by conv_shortcut when requested.
+    """
+
+    def _norm(name):
+        return SubModuleReplacementDescription(suffix=f"{prefix}.{name}", target_module=GroupNormTP)
+
+    def _row_conv(name):
+        return SubModuleReplacementDescription(
+            suffix=f"{prefix}.{name}.conv",
+            target_module=Conv3dTPRow,
+            kwargs=dict(split_output=True),
+        )
+
+    descriptions = [
+        _norm("norm1"),
+        _row_conv("conv1"),
+        _norm("norm2"),
+        _row_conv("conv2"),
+    ]
+
+    if with_shortcut:
+        descriptions.append(_row_conv("conv_shortcut"))
+    return descriptions
+
+
+class HunyuanVaePolicy(Policy):
+    def config_sanity_check(self):
+        pass  # no configuration checks are performed
+
+    def preprocess(self):
+        return self.model  # the model is handed back unchanged
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        policy = {}  # maps a module class to the replacements applied inside it
+
+        policy[EncoderCausal3D] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="conv_in.conv",
+                    target_module=Conv3dTPCol,
+                ),
+                *gen_resnets_replacements("down_blocks[0].resnets[0]"),
+                *gen_resnets_replacements("down_blocks[0].resnets[1]"),
+                SubModuleReplacementDescription(
+                    suffix="down_blocks[0].downsamplers[0].conv.conv",
+                    target_module=Conv3dTPRow,
+                    kwargs=dict(
+                        split_output=True,
+                    ),
+                ),
+                *gen_resnets_replacements("down_blocks[1].resnets[0]", with_shortcut=True),
+                *gen_resnets_replacements("down_blocks[1].resnets[1]"),
+                SubModuleReplacementDescription(
+                    suffix="down_blocks[1].downsamplers[0].conv.conv",
+                    target_module=Conv3dTPRow,  # NOTE(review): no split_output here, unlike the entries above - confirm intended
+                ),
+                SubModuleReplacementDescription(
+                    suffix="mid_block.attentions[0]",
+                    target_module=ContextParallelAttention,
+                ),
+            ],
+            attribute_replacement={  # each channel count is divided by the tensor-parallel size
+                "down_blocks[0].downsamplers[0].channels": self.model.encoder.down_blocks[0].downsamplers[0].channels
+                // self.shard_config.tensor_parallel_size,
+                "down_blocks[1].downsamplers[0].channels": self.model.encoder.down_blocks[1].downsamplers[0].channels
+                // self.shard_config.tensor_parallel_size,
+                # "mid_block.attentions[0].processor": MemEfficientRingAttnProcessor(
+                #     self.shard_config.tensor_parallel_process_group
+                # ),
+            },
+            method_replacement={  # the tensor-parallel process group is passed as cp_group
+                "prepare_attention_mask": partial(
+                    prepare_parallel_attention_mask, cp_group=self.shard_config.tensor_parallel_process_group
+                ),
+            },
+        )
+
+        policy[DecoderCausal3D] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="up_blocks[1].upsamplers[0]",
+                    target_module=TPUpDecoderBlockCausal3D,
+                    kwargs=dict(
+                        split_output=True,
+                    ),
+                ),
+                *gen_resnets_replacements("up_blocks[2].resnets[0]", with_shortcut=True),
+                *gen_resnets_replacements("up_blocks[2].resnets[1]"),
+                *gen_resnets_replacements("up_blocks[2].resnets[2]"),
+                SubModuleReplacementDescription(
+                    suffix="up_blocks[2].upsamplers[0].conv.conv",
+                    target_module=Conv3dTPRow,
+                    kwargs=dict(
+                        split_output=True,
+                    ),
+                ),
+                *gen_resnets_replacements("up_blocks[3].resnets[0]", with_shortcut=True),
+                *gen_resnets_replacements("up_blocks[3].resnets[1]"),
+                *gen_resnets_replacements("up_blocks[3].resnets[2]"),
+                SubModuleReplacementDescription(
+                    suffix="conv_norm_out",
+                    target_module=GroupNormTP,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="conv_out.conv",
+                    target_module=Conv3dTPRow,  # NOTE(review): no split_output here, unlike the entries above - confirm intended
+                ),
+                SubModuleReplacementDescription(
+                    suffix="mid_block.attentions[0]",
+                    target_module=ContextParallelAttention,
+                ),
+            ],
+            attribute_replacement={  # the channel count is divided by the tensor-parallel size
+                "up_blocks[2].upsamplers[0].channels": self.model.decoder.up_blocks[2].upsamplers[0].channels
+                // self.shard_config.tensor_parallel_size,
+                # "mid_block.attentions[0].processor": MemEfficientRingAttnProcessor(
+                #     self.shard_config.tensor_parallel_process_group
+                # ),
+            },
+            method_replacement={  # the tensor-parallel process group is passed as cp_group
+                "prepare_attention_mask": partial(
+                    prepare_parallel_attention_mask, cp_group=self.shard_config.tensor_parallel_process_group
+                ),
+            },
+        )
+
+        return policy
+
+    def postprocess(self):
+        return self.model  # the model is handed back unchanged
diff --git a/opensora/models/text/__init__.py b/opensora/models/text/__init__.py
new file mode 100644
index 0000000..5671c1e
--- /dev/null
+++ b/opensora/models/text/__init__.py
@@ -0,0 +1 @@
+from .conditioner import HFEmbedder
diff --git a/opensora/models/text/conditioner.py b/opensora/models/text/conditioner.py
new file mode 100644
index 0000000..1ac5175
--- /dev/null
+++ b/opensora/models/text/conditioner.py
@@ -0,0 +1,74 @@
+from colossalai.shardformer import ShardConfig, ShardFormer
+from torch import Tensor, nn
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
+
+from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
+from opensora.registry import MODELS
+
+
+@MODELS.register_module("text_embedder")
+class HFEmbedder(nn.Module):
+    def __init__(self, from_pretrained: str, max_length: int, shardformer: bool = False, **hf_kwargs):
+        super().__init__()
+        self.is_clip = "openai" in from_pretrained  # name heuristic: "openai" in the id means CLIP, anything else T5
+        self.max_length = max_length
+        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"  # which model output forward() returns
+
+        if self.is_clip:
+            self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(from_pretrained, max_length=max_length)
+            self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(from_pretrained, **hf_kwargs)
+            assert not shardformer, "Shardformer is not supported for CLIP"  # NOTE(review): only checked after the weights were loaded
+        else:
+            self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
+                from_pretrained, max_length=max_length, legacy=True
+            )
+            self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(from_pretrained, **hf_kwargs)
+            if shardformer:
+                self.hf_module = shardformer_t5(self.hf_module)
+
+        self.hf_module = self.hf_module.eval().requires_grad_(False)  # frozen: eval mode, no gradients
+
+    def forward(self, text: list[str], added_tokens: int = 0, seq_align: int = 1) -> Tensor:
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",  # every sequence is padded (or truncated) to max_length
+            return_tensors="pt",
+        )
+        seq_len = batch_encoding["input_ids"].shape[1]
+        if (added_tokens + seq_len) % seq_align != 0:  # right-pad until added_tokens + seq_len is a multiple of seq_align
+            num_pad_tokens = seq_align - (added_tokens + seq_len) % seq_align
+            batch_encoding["input_ids"] = nn.functional.pad(
+                batch_encoding["input_ids"], (0, num_pad_tokens), value=self.tokenizer.pad_token_id
+            )
+
+        outputs = self.hf_module(
+            input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
+            attention_mask=None,  # the tokenizer's attention mask is not forwarded
+            output_hidden_states=False,
+        )
+        return outputs[self.output_key]
+
+
+def shardformer_t5(t5: T5EncoderModel) -> T5EncoderModel:
+    """
+    Run a T5 encoder through ShardFormer using T5EncoderPolicy.
+
+    Tensor parallelism is switched off and ``enable_jit_fused`` is switched on.
+
+    Args:
+        t5: T5 model to be optimized
+
+    Returns:
+        the optimized model, cast back to the dtype of the original shared
+        embedding weight, in eval mode and with gradients disabled
+    """
+    original_dtype = t5.shared.weight.dtype  # captured up front, re-applied after optimize()
+    former = ShardFormer(shard_config=ShardConfig(enable_tensor_parallelism=False, enable_jit_fused=True))
+    optimized, _unused = former.optimize(t5, policy=T5EncoderPolicy())
+    optimized = optimized.to(original_dtype)
+    optimized = optimized.eval()
+    return optimized.requires_grad_(False)
diff --git a/opensora/models/vae/__init__.py b/opensora/models/vae/__init__.py
new file mode 100644
index 0000000..44f6ede
--- /dev/null
+++ b/opensora/models/vae/__init__.py
@@ -0,0 +1,2 @@
+from .autoencoder_2d import AutoEncoderFlux
+from .discriminator import N_LAYER_DISCRIMINATOR_3D
diff --git a/opensora/models/vae/autoencoder_2d.py b/opensora/models/vae/autoencoder_2d.py
new file mode 100644
index 0000000..f954d05
--- /dev/null
+++ b/opensora/models/vae/autoencoder_2d.py
@@ -0,0 +1,339 @@
+# Modified from Flux
+#
+# Copyright 2024 Black Forest Labs
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+from torch.nn.functional import silu as swish
+
+from opensora.registry import MODELS
+from opensora.utils.ckpt import load_checkpoint
+
+from .utils import DiagonalGaussianDistribution
+
+
+@dataclass
+class AutoEncoderConfig:
+    from_pretrained: str | None
+    cache_dir: str | None
+    resolution: int  # stored by Encoder/Decoder; the Decoder derives z_shape from it
+    in_channels: int  # input channels of the Encoder's first conv
+    ch: int  # base channel count, scaled per level by ch_mult
+    out_ch: int  # output channels of the Decoder's last conv
+    ch_mult: list[int]  # one multiplier per resolution level; its length is the number of levels
+    num_res_blocks: int  # ResnetBlocks per Encoder level; the Decoder builds one more per level
+    z_channels: int  # latent channels; the Encoder emits 2 * z_channels
+    scale_factor: float  # encode applies z = scale_factor * (z - shift_factor); decode inverts it
+    shift_factor: float
+    sample: bool = True  # True: encode draws posterior.sample(); False: uses posterior.mode()
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)  # 1x1 convs act as per-pixel linear projections
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape  # only h and w are used below
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()  # one head; the h*w pixels form the sequence
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w)  # back to the image layout
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))  # residual connection around the attention
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int | None):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels  # None means "keep the channel count"
+        self.out_channels = out_channels
+
+        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:  # the 1x1 shortcut exists only when the channel count changes
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        h = x
+        h = self.norm1(h)  # first norm -> swish -> conv stage
+        h = swish(h)
+        h = self.conv1(h)
+
+        h = self.norm2(h)  # second norm -> swish -> conv stage
+        h = swish(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:  # project the skip path to the new channel count
+            x = self.nin_shortcut(x)
+
+        return x + h
+
+
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)  # padding is applied manually in forward
+
+    def forward(self, x: Tensor) -> Tensor:
+        pad = (0, 1, 0, 1)  # (left, right, top, bottom): one zero column on the right, one zero row at the bottom
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        return self.conv(x)  # stride-2 conv on the asymmetrically padded input
+
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)  # size-preserving 3x3 conv
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")  # 2x nearest-neighbour upsampling
+        return self.conv(x)  # conv applied after the interpolation
+
+
+class Encoder(nn.Module):
+    def __init__(self, config: AutoEncoderConfig):
+        super().__init__()
+        self.ch = config.ch
+        self.num_resolutions = len(config.ch_mult)  # one resolution level per ch_mult entry
+        self.num_res_blocks = config.num_res_blocks
+        self.resolution = config.resolution
+        self.in_channels = config.in_channels
+
+        # downsampling
+        self.conv_in = nn.Conv2d(config.in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+
+        curr_res = config.resolution
+        in_ch_mult = (1,) + tuple(config.ch_mult)  # multiplier of the channels entering each level
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()  # stays empty: no attention is built inside the levels
+            block_in = config.ch * in_ch_mult[i_level]
+            block_out = config.ch * config.ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()  # plain container so sub-modules register as down.<i>.block / attn / downsample
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:  # every level except the last one downsamples
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, 2 * config.z_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling: thread one running activation through the levels. Keeping every intermediate
+        # in a list only delayed freeing them, because nothing but the latest entry was ever read.
+        h = self.conv_in(x)
+        for i_level in range(self.num_resolutions):
+            level = self.down[i_level]
+            for i_block in range(self.num_res_blocks):
+                h = level.block[i_block](h)
+                if len(level.attn) > 0:
+                    h = level.attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = level.downsample(h)
+
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    def __init__(self, config: AutoEncoderConfig):
+        super().__init__()
+        self.ch = config.ch
+        self.num_resolutions = len(config.ch_mult)  # one resolution level per ch_mult entry
+        self.num_res_blocks = config.num_res_blocks
+        self.resolution = config.resolution
+        self.in_channels = config.in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)  # one 2x Upsample per level except level 0
+
+        block_in = config.ch * config.ch_mult[self.num_resolutions - 1]  # start from the widest level
+        curr_res = config.resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, config.z_channels, curr_res, curr_res)  # stored only; not read elsewhere in this class
+
+        # z to block_in
+        self.conv_in = nn.Conv2d(config.z_channels, block_in, kernel_size=3, stride=1, padding=1)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):  # built from the last level down to level 0
+            block = nn.ModuleList()
+            attn = nn.ModuleList()  # stays empty: no attention is built inside the levels
+            block_out = config.ch * config.ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):  # one more ResnetBlock per level than the Encoder builds
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()  # plain container so sub-modules register as up.<i>.block / attn / upsample
+            up.block = block
+            up.attn = attn
+            if i_level != 0:  # every level except level 0 upsamples
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, config.out_ch, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, z: Tensor) -> Tensor:
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):  # same order the levels were built in
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        return self.conv_out(h)
+
+
+class AutoEncoder(nn.Module):
+    def __init__(self, config: AutoEncoderConfig):
+        super().__init__()
+        self.encoder = Encoder(config)
+        self.decoder = Decoder(config)
+        self.scale_factor = config.scale_factor
+        self.shift_factor = config.shift_factor
+        self.sample = config.sample  # True: sample the posterior in encode_; False: take its mode
+
+    def encode_(self, x: Tensor) -> tuple[Tensor, DiagonalGaussianDistribution]:
+        T = x.shape[2]  # x is laid out "b c t h w"
+        x = rearrange(x, "b c t h w -> (b t) c h w")  # fold frames into the batch for the 2D encoder
+        params = self.encoder(x)
+        params = rearrange(params, "(b t) c h w -> b c t h w", t=T)
+        posterior = DiagonalGaussianDistribution(params)
+        if self.sample:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        z = self.scale_factor * (z - self.shift_factor)  # decode() applies the inverse of this
+        return z, posterior
+
+    def encode(self, x: Tensor) -> Tensor:
+        return self.encode_(x)[0]  # latent only; the posterior is dropped
+
+    def decode(self, z: Tensor) -> Tensor:
+        T = z.shape[2]  # z is laid out "b c t h w"
+        z = rearrange(z, "b c t h w -> (b t) c h w")  # fold frames into the batch for the 2D decoder
+        z = z / self.scale_factor + self.shift_factor  # undo the scaling applied in encode_
+        x = self.decoder(z)
+        x = rearrange(x, "(b t) c h w -> b c t h w", t=T)
+        return x
+
+    def forward(self, x: Tensor) -> tuple[Tensor, DiagonalGaussianDistribution, Tensor]:
+        """Encode then decode `x`; returns (reconstruction, posterior, latent z)."""
+        # encode
+        z, posterior = self.encode_(x)
+        # decode
+        x_rec = self.decode(z)
+
+        return x_rec, posterior, z
+
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight  # weight of the decoder's final conv
+
+
+@MODELS.register_module("autoencoder_2d")
+def AutoEncoderFlux(
+    from_pretrained: str,
+    cache_dir=None,
+    resolution=256,
+    in_channels=3,
+    ch=128,
+    out_ch=3,
+    ch_mult=(1, 2, 4, 4),  # immutable default; a list default would be one object shared by every call
+    num_res_blocks=2,
+    z_channels=16,
+    scale_factor=0.3611,
+    shift_factor=0.1159,
+    device_map: str | torch.device = "cuda",
+    torch_dtype: torch.dtype = torch.bfloat16,
+) -> AutoEncoder:
+    config = AutoEncoderConfig(
+        from_pretrained=from_pretrained,
+        cache_dir=cache_dir,
+        resolution=resolution,
+        in_channels=in_channels,
+        ch=ch,
+        out_ch=out_ch,
+        ch_mult=list(ch_mult),  # per-call copy, so the config never aliases the caller's sequence
+        num_res_blocks=num_res_blocks,
+        z_channels=z_channels,
+        scale_factor=scale_factor,
+        shift_factor=shift_factor,
+    )
+    with torch.device(device_map):  # parameters are created directly on the target device
+        model = AutoEncoder(config).to(torch_dtype)
+    if from_pretrained:  # a falsy value (None or "") skips checkpoint loading
+        model = load_checkpoint(model, from_pretrained, cache_dir=cache_dir, device_map=device_map)
+    return model
diff --git a/opensora/models/vae/discriminator.py b/opensora/models/vae/discriminator.py
new file mode 100644
index 0000000..48f1cf8
--- /dev/null
+++ b/opensora/models/vae/discriminator.py
@@ -0,0 +1,109 @@
+import os
+
+import torch.nn as nn
+
+from opensora.registry import MODELS
+from opensora.utils.ckpt import load_checkpoint
+
+
+def weights_init(m):
+    """``Module.apply`` hook keyed on the class name: Conv* weight ~ N(0, std 0.02); BatchNorm* weight ~ N(1, std 0.02), bias 0."""
+    if "Conv" in type(m).__name__:
+        nn.init.normal_(m.weight.data, 0.0, 0.02)
+    elif "BatchNorm" in type(m).__name__:
+        nn.init.normal_(m.weight.data, 1.0, 0.02)
+        nn.init.constant_(m.bias.data, 0)
+
+
+def weights_init_conv(m):
+    """Same init as ``weights_init``, applied to ``m.conv`` when the module has a ``conv`` attribute."""
+    target = getattr(m, "conv", m)
+    kind = type(target).__name__
+    if "Conv" in kind:
+        nn.init.normal_(target.weight.data, 0.0, 0.02)
+    elif "BatchNorm" in kind:
+        nn.init.normal_(target.weight.data, 1.0, 0.02)
+        nn.init.constant_(target.bias.data, 0)
+
+
+class NLayerDiscriminator3D(nn.Module):
+    """Defines a 3D PatchGAN discriminator as in Pix2Pix but for 3D inputs.
+
+    The output is the raw single-channel map produced by the last conv;
+    no activation is applied to it.
+    """
+
+    def __init__(
+        self,
+        input_nc=1,
+        ndf=64,
+        n_layers=5,
+        norm_layer=nn.BatchNorm3d,
+        conv_cls="conv3d",
+        dropout=0.30,
+    ):
+        """
+        Construct a 3D PatchGAN discriminator.
+
+        Layout: one strided conv + LeakyReLU, then ``n_layers - 1`` strided
+        conv/norm/LeakyReLU/dropout stages, one stride-1 stage of the same
+        kind, and a final conv down to a single channel.
+
+        Parameters:
+            input_nc (int)   -- the number of channels in input volumes
+            ndf (int)        -- the number of filters in the first conv layer;
+                                stage ``n`` has ``ndf * min(2**n, 8)`` filters
+            n_layers (int)   -- the number of strided conv layers, first one included
+            norm_layer       -- callable that builds a norm layer from a channel count
+            conv_cls (str)   -- only "conv3d" is accepted
+            dropout (float)  -- dropout probability after each normalised stage
+
+        Raises:
+            AssertionError: if ``conv_cls`` is not "conv3d".
+        """
+        super().__init__()
+        assert conv_cls == "conv3d"
+
+        def width(stage):
+            # filters double per stage and are capped at 8 * ndf
+            return ndf * min(2**stage, 8)
+
+        def stage_layers(c_in, c_out, stride):
+            # bias-free conv -> norm -> LeakyReLU -> dropout
+            return [
+                nn.Conv3d(c_in, c_out, kernel_size=(3, 3, 3), stride=stride, padding=1, bias=False),
+                norm_layer(c_out),
+                nn.LeakyReLU(0.2, True),
+                nn.Dropout(dropout),
+            ]
+
+        # first stage: strided conv without norm or dropout
+        layers = [nn.Conv3d(input_nc, ndf, kernel_size=3, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
+        for n in range(1, n_layers):
+            # stage 1 strides all three axes; later stages keep stride 1 on the first of them
+            layers += stage_layers(width(n - 1), width(n), 2 if n == 1 else (1, 2, 2))
+
+        # input width of the stride-1 stage: last strided stage, or ndf if the loop did not run
+        last_in = width(n_layers - 1) if n_layers > 1 else ndf
+        layers += stage_layers(last_in, width(n_layers), 1)
+        layers.append(nn.Conv3d(width(n_layers), 1, kernel_size=3, stride=1, padding=1))
+        self.main = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Run ``x`` through the layer stack.
+
+        Returns the output of the final single-channel conv.
+        """
+        return self.main(x)
+
+
+@MODELS.register_module("N_Layer_discriminator_3D")
+def N_LAYER_DISCRIMINATOR_3D(from_pretrained=None, force_huggingface=None, **kwargs):
+    """Build an ``NLayerDiscriminator3D`` from ``kwargs``; load ``from_pretrained`` if it is an existing local path."""
+    model = NLayerDiscriminator3D(**kwargs).apply(weights_init)
+    if from_pretrained is not None:
+        if force_huggingface or not os.path.exists(from_pretrained):
+            raise NotImplementedError(f"only existing local checkpoints are supported, got {from_pretrained!r}")
+        load_checkpoint(model, from_pretrained)
+        print(f"loaded model from: {from_pretrained}")
+    return model
diff --git a/opensora/models/vae/losses.py b/opensora/models/vae/losses.py
new file mode 100644
index 0000000..7d32cc3
--- /dev/null
+++ b/opensora/models/vae/losses.py
@@ -0,0 +1,223 @@
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import Tensor, nn
+
+from opensora.models.vae.lpips import LPIPS
+
+
+def hinge_d_loss(logits_real, logits_fake):
+    """Hinge discriminator loss: 0.5 * (mean(relu(1 - real)) + mean(relu(1 + fake)))."""
+    real_term = F.relu(1.0 - logits_real).mean()
+    fake_term = F.relu(1.0 + logits_fake).mean()
+    return 0.5 * (real_term + fake_term)
+
+
+def vanilla_d_loss(logits_real, logits_fake):
+    """Softplus discriminator loss: 0.5 * (mean(softplus(-real)) + mean(softplus(fake)))."""
+    real_term = F.softplus(-logits_real).mean()
+    fake_term = F.softplus(logits_fake).mean()
+    return 0.5 * (real_term + fake_term)
+
+
+def wgan_gp_loss(logits_real, logits_fake):
+    """Return 0.5 * (mean(fake) - mean(real)); no gradient-penalty term is computed here."""
+    return 0.5 * (logits_fake.mean() - logits_real.mean())
+
+
+def adopt_weight(weight, global_step, threshold=0, value=0.0):
+    """Return ``value`` while ``global_step < threshold``, otherwise ``weight``."""
+    # Lets a loss term stay at `value` (0.0 by default) until `threshold` steps have passed.
+    return value if global_step < threshold else weight
+
+
+def measure_perplexity(predicted_indices, n_embed):
+    """Return ``(perplexity, n_used)`` of code usage; perplexity equals ``n_embed`` when all codes are used equally."""
+    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
+    one_hot = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
+    usage = one_hot.mean(dim=0)  # empirical probability of each code
+    entropy = -torch.sum(usage * torch.log(usage + 1e-10))  # the epsilon guards log(0)
+    n_used = (usage > 0).sum()
+    return torch.exp(entropy), n_used
+
+
+def l1(x, y):
+    return (x - y).abs()  # element-wise absolute difference; no reduction
+
+
+def l2(x, y):
+    return (x - y).pow(2)  # element-wise squared difference; no reduction
+
+
+def batch_mean(x):
+    return x.sum() / x.size(0)  # sum over every element, divided by the size of dim 0
+
+
+def sigmoid_cross_entropy_with_logits(labels, logits):
+    """Element-wise sigmoid cross entropy in the stable form max(x, 0) - x * z + log(1 + exp(-abs(x)))."""
+    zero = torch.zeros_like(logits)
+    non_negative = logits >= zero
+    positive_part = torch.where(non_negative, logits, zero)  # max(x, 0)
+    minus_abs = torch.where(non_negative, -logits, logits)  # -abs(x)
+    return positive_part - logits * labels + torch.log1p(minus_abs.exp())
+
+
+def lecam_reg(real_pred, fake_pred, ema_real_pred, ema_fake_pred):
+    assert real_pred.ndim == 0 and ema_fake_pred.ndim == 0  # only these two are checked to be 0-dim
+    penalty = F.relu(real_pred - ema_fake_pred).pow(2).mean()  # real prediction above the fake EMA
+    penalty += F.relu(ema_real_pred - fake_pred).pow(2).mean()  # real EMA above the fake prediction
+    return penalty
+
+
+def gradient_penalty_fn(images, output):
+    """Gradient penalty: batch mean of ``(||d output / d images||_2 - 1) ** 2``, itself differentiable."""
+    grads = torch.autograd.grad(
+        outputs=output,
+        inputs=images,
+        grad_outputs=torch.ones(output.size(), device=images.device),
+        create_graph=True,
+        retain_graph=True,
+        only_inputs=True,
+    )[0]
+    per_sample_norm = rearrange(grads, "b ... -> b (...)").norm(2, dim=1)
+    return ((per_sample_norm - 1) ** 2).mean()
+
+
+class VAELoss(nn.Module):
+    """L1 reconstruction + LPIPS perceptual + KL loss for video VAEs, with a learnable log-variance."""
+
+    def __init__(
+        self,
+        logvar_init=0.0,
+        perceptual_loss_weight=1.0,
+        kl_loss_weight=5e-4,
+        device="cpu",
+        dtype="bf16",
+    ):
+        """
+        Args:
+            logvar_init: initial value of the learnable scalar log-variance.
+            perceptual_loss_weight: multiplier of the LPIPS term inside the NLL.
+            kl_loss_weight: multiplier applied to the KL term.
+            device: device the LPIPS network is moved to.
+            dtype: a ``torch.dtype``, or one of the strings "bf16", "fp16", "fp32".
+
+        Raises:
+            NotImplementedError: if ``dtype`` is a string other than the three above.
+        """
+        super().__init__()
+
+        if isinstance(dtype, str):
+            known_dtypes = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
+            if dtype not in known_dtypes:
+                raise NotImplementedError(f"dtype: {dtype}")
+            dtype = known_dtypes[dtype]
+
+        self.kl_weight = kl_loss_weight
+        # LPIPS acts as a fixed metric: eval mode and no gradients for its weights
+        self.perceptual_loss_fn = LPIPS().eval().to(device, dtype)
+        self.perceptual_loss_fn.requires_grad_(False)
+        self.perceptual_loss_weight = perceptual_loss_weight
+        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
+
+    def forward(self, video, recon_video, posterior) -> dict:
+        """Compute the loss terms for ``b c t h w`` inputs; time is folded into the batch axis first.
+
+        ``posterior`` may be None, in which case the KL term is 0.
+        Returns a dict with "nll_loss", "kl_loss" (already weighted), "recon_loss" and "perceptual_loss".
+        """
+        video = rearrange(video, "b c t h w -> (b t) c h w").contiguous()
+        recon_video = rearrange(recon_video, "b c t h w -> (b t) c h w").contiguous()
+
+        recon_loss = l1(video, recon_video)
+        perceptual_loss = self.perceptual_loss_fn(video, recon_video)
+        # NLL with a learnable variance; the perceptual value is broadcast against the per-element L1
+        nll_loss = recon_loss + perceptual_loss * self.perceptual_loss_weight
+        nll_loss = nll_loss / torch.exp(self.logvar) + self.logvar
+
+        nll_loss = batch_mean(nll_loss)
+        recon_loss = batch_mean(recon_loss)
+        # scale by elements-per-frame, the weight the broadcast gave this term inside nll_loss
+        perceptual_loss = batch_mean(perceptual_loss) * (video.numel() // video.size(0))
+
+        if posterior is None:
+            kl_loss = torch.tensor(0.0).to(video.device, video.dtype)
+        else:
+            kl_loss = batch_mean(posterior.kl())
+
+        return {
+            "nll_loss": nll_loss,
+            "kl_loss": kl_loss * self.kl_weight,
+            "recon_loss": recon_loss,
+            "perceptual_loss": perceptual_loss,
+        }
+
+
+class GeneratorLoss(nn.Module):
+    """Adversarial loss for the generator, balanced against the NLL by a gradient-norm ratio."""
+
+    def __init__(self, gen_start=2001, disc_factor=1.0, disc_weight=0.5):
+        super().__init__()
+        self.gen_start = gen_start  # steps below this value get a factor of 0.0
+        self.disc_factor = disc_factor
+        self.disc_weight = disc_weight
+
+    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer):
+        """Return ``disc_weight * clamp(||grad nll|| / (||grad g|| + 1e-4), 0, 1e4)``, detached.
+
+        Both gradients are taken with respect to ``last_layer``; the graph is retained.
+        """
+        nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+        g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+        ratio = nll_grads.norm() / (g_grads.norm() + 1e-4)
+        return ratio.clamp(0.0, 1e4).detach() * self.disc_weight
+
+    def forward(self, logits_fake, nll_loss, last_layer, global_step, is_training=True):
+        """Return ``(weighted_gen_loss, g_loss)`` with ``g_loss = -mean(logits_fake)``.
+
+        ``is_training`` is accepted but not used by this method.
+        """
+        g_loss = -torch.mean(logits_fake)
+
+        use_adaptive = self.disc_factor is not None and self.disc_factor > 0.0
+        d_weight = (
+            self.calculate_adaptive_weight(nll_loss, g_loss, last_layer) if use_adaptive else torch.tensor(1.0)
+        )
+
+        # factor is 0.0 while global_step < gen_start, self.disc_factor afterwards
+        disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.gen_start)
+        return d_weight * disc_factor * g_loss, g_loss
+
+
+class DiscriminatorLoss(nn.Module):
+    """Discriminator loss of type "hinge", "vanilla" or "wgan-gp", zero-weighted before ``disc_start``."""
+
+    def __init__(self, disc_start=2001, disc_factor=1.0, disc_loss_type="hinge"):
+        """
+        Args:
+            disc_start: first global step at which the loss gets a non-zero factor.
+            disc_factor: multiplier of the loss; None or a non-positive value disables it.
+            disc_loss_type: one of "hinge", "vanilla", "wgan-gp".
+        """
+        super().__init__()
+        assert disc_loss_type in ["hinge", "vanilla", "wgan-gp"]
+        self.disc_factor = disc_factor
+        self.disc_start = disc_start
+        self.disc_loss_type = disc_loss_type
+
+        loss_fns = {"hinge": hinge_d_loss, "vanilla": vanilla_d_loss, "wgan-gp": wgan_gp_loss}
+        if self.disc_loss_type not in loss_fns:
+            # reachable only when the assert above is stripped (python -O)
+            raise ValueError(f"Unknown GAN loss '{self.disc_loss_type}'.")
+        self.loss_fn = loss_fns[self.disc_loss_type]
+
+    def forward(self, real_logits, fake_logits, global_step):
+        """Return ``factor * loss_fn(real_logits, fake_logits)``.
+
+        ``factor`` is 0.0 while ``global_step < disc_start`` and ``disc_factor`` afterwards.
+        If ``disc_factor`` is None or not greater than 0, the plain int ``0`` is returned.
+        """
+        if self.disc_factor is None or not self.disc_factor > 0.0:
+            return 0
+        disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.disc_start)
+        return disc_factor * self.loss_fn(real_logits, fake_logits)
diff --git a/opensora/models/vae/lpips.py b/opensora/models/vae/lpips.py
new file mode 100644
index 0000000..784ffb1
--- /dev/null
+++ b/opensora/models/vae/lpips.py
@@ -0,0 +1,186 @@
+import hashlib
+import os
+from collections import namedtuple
+
+import requests
+import torch
+import torch.nn as nn
+from torchvision import models
+from tqdm import tqdm
+
+from opensora.acceleration.checkpoint import checkpoint
+
+URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"}
+
+CKPT_MAP = {"vgg_lpips": "vgg.pth"}
+
+MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"}
+
+
+def md5_hash(path):
+    """Return the hex MD5 digest of the file at ``path`` (the whole file is read into memory)."""
+    with open(path, "rb") as f:
+        return hashlib.md5(f.read()).hexdigest()
+
+
+def download(url, local_path, chunk_size=1024):
+    """Stream ``url`` to ``local_path`` with a progress bar; raises ``requests.HTTPError`` on a bad status."""
+    os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)  # dirname is "" for a bare filename
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()  # do not save an error page as if it were the requested file
+        total_size = int(r.headers.get("content-length", 0))
+        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, open(local_path, "wb") as f:
+            for data in r.iter_content(chunk_size=chunk_size):
+                f.write(data)
+                pbar.update(len(data))  # count written bytes; a chunk can be shorter than chunk_size
+
+
+def get_ckpt_path(name, root=".", check=False):
+    """Return the checkpoint path for ``name`` under ``root``; download + MD5-check it if missing (or stale with ``check``)."""
+    assert name in URL_MAP
+    path = os.path.join(root, CKPT_MAP[name])
+    if not os.path.exists(path) or (check and md5_hash(path) != MD5_MAP[name]):
+        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+        download(URL_MAP[name], path)
+        assert (md5 := md5_hash(path)) == MD5_MAP[name], md5
+    return path
+
+
+class LPIPS(nn.Module):
+    # Learned perceptual metric (LPIPS) on VGG16 features; every parameter is frozen at the end of __init__
+    def __init__(self, use_dropout=True):
+        super().__init__()
+        self.scaling_layer = ScalingLayer()
+        self.chns = [64, 128, 256, 512, 512]  # vgg16 features: input channels of lin0..lin4, one per VGG slice
+        self.net = vgg16(pretrained=True, requires_grad=False)
+        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+        self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]  # plain list for indexed access
+        self.load_from_pretrained()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def load_from_pretrained(self, name="vgg_lpips"):  # downloads into the user cache dir when missing
+        path = os.path.expanduser("~/.cache/opensora/taming/modules/autoencoder/lpips")
+        ckpt = get_ckpt_path(name, path)
+        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)  # non-strict load
+
+    @classmethod
+    def from_pretrained(cls, name="vgg_lpips"):
+        if name != "vgg_lpips":
+            raise NotImplementedError
+        model = cls()  # NOTE(review): __init__ already calls load_from_pretrained, so weights are loaded twice
+        ckpt = get_ckpt_path(name)  # default root ".": the path is relative to the working directory
+        model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
+        return model
+
+    def forward_old(self, input, target):  # same sum of layer scores, computed without the checkpoint wrapper
+        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
+        outs0, outs1 = self.net(in0_input), self.net(in1_input)
+        feats0, feats1, diffs = {}, {}, {}
+        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+        for kk in range(len(self.chns)):
+            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
+            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
+
+        res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
+        val = res[0]
+        for l in range(1, len(self.chns)):
+            val += res[l]  # in-place accumulation into res[0]
+        return val
+
+    def get_layer_loss(self, input, target, i):  # score of VGG slice i+1, plus that slice's outputs for the next call
+        input, target = getattr(self.net, f"slice{i+1}")(input), getattr(self.net, f"slice{i+1}")(target)
+        feats0, feats1 = normalize_tensor(input), normalize_tensor(target)
+        diff = (feats0 - feats1) ** 2
+        avg = spatial_average(self.lins[i].model(diff), keepdim=True)
+        return avg, input, target
+
+    def forward(self, input, target):  # sum of the five per-slice scores; each slice runs through `checkpoint`
+        input, target = (self.scaling_layer(input), self.scaling_layer(target))
+
+        val = None
+        for i in range(len(self.chns)):
+            avg, input, target = checkpoint(self.get_layer_loss, input, target, i, use_reentrant=False)
+            val = avg if val is None else val + avg
+        return val
+
+
+class ScalingLayer(nn.Module):
+    def __init__(self):  # fixed per-channel constants, kept as buffers of shape (1, 3, 1, 1)
+        super().__init__()
+        self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188]).view(1, 3, 1, 1))
+        self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450]).view(1, 3, 1, 1))
+
+    def forward(self, inp):  # channel-wise normalisation, broadcast over the other dims
+        return (inp - self.shift) / self.scale
+
+
+class NetLinLayer(nn.Module):
+    """A single linear layer implemented as a bias-free 1x1 conv.
+
+    No ``forward`` is defined on this class; the layer is used through ``self.model``.
+    """
+
+    def __init__(self, chn_in, chn_out=1, use_dropout=False):
+        """
+        Args:
+            chn_in: number of input channels of the 1x1 conv.
+            chn_out: number of output channels of the 1x1 conv.
+            use_dropout: put an ``nn.Dropout()`` (default probability) in front of the conv.
+        """
+        super().__init__()
+        conv = nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False)
+        self.model = nn.Sequential(nn.Dropout(), conv) if use_dropout else nn.Sequential(conv)
+
+
+class vgg16(torch.nn.Module):
+    """torchvision VGG16 ``features`` split into five sequential slices (``slice1`` .. ``slice5``)."""
+
+    def __init__(self, requires_grad=False, pretrained=True):
+        """
+        Args:
+            requires_grad: when False, ``requires_grad`` is switched off for every parameter.
+            pretrained: forwarded to ``torchvision.models.vgg16``.
+        """
+        super().__init__()
+        features = models.vgg16(pretrained=pretrained).features
+        self.N_slices = 5
+
+        # (start, stop) index ranges into ``features``; each layer keeps its index as sub-module name
+        bounds = [(0, 4), (4, 9), (9, 16), (16, 23), (23, 30)]
+        for slice_no, (start, stop) in enumerate(bounds, start=1):
+            block = torch.nn.Sequential()
+            for idx in range(start, stop):
+                block.add_module(str(idx), features[idx])
+            setattr(self, f"slice{slice_no}", block)
+
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        """Run the slices in order and return their five outputs as a ``VggOutputs`` namedtuple.
+
+        The fields are relu1_2, relu2_2, relu3_3, relu4_3 and relu5_3, in slice order.
+        """
+        activations = []
+        h = X
+        for slice_no in range(1, 6):
+            h = getattr(self, f"slice{slice_no}")(h)
+            activations.append(h)
+
+        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
+        return vgg_outputs(*activations)
+
+
+def normalize_tensor(x, eps=1e-10):
+    """Divide ``x`` by its L2 norm over dim 1 (plus ``eps``, so an all-zero vector does not divide by zero)."""
+    return x / (x.pow(2).sum(dim=1, keepdim=True).sqrt() + eps)
+
+
+def spatial_average(x, keepdim=True):
+    return x.mean(dim=(2, 3), keepdim=keepdim)  # mean over dims 2 and 3
diff --git a/opensora/models/vae/tensor_parallel.py b/opensora/models/vae/tensor_parallel.py
new file mode 100644
index 0000000..f423cce
--- /dev/null
+++ b/opensora/models/vae/tensor_parallel.py
@@ -0,0 +1,558 @@
+from typing import List, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from colossalai.device.device_mesh import DeviceMesh
+from colossalai.shardformer.layer._operation import (
+    gather_forward_split_backward,
+    reduce_forward,
+    split_forward_gather_backward,
+)
+from colossalai.shardformer.layer.parallel_module import ParallelModule
+from colossalai.tensor.d_tensor.api import (
+    distribute_tensor,
+    is_distributed_tensor,
+    shard_rowwise,
+    sharded_tensor_to_existing_param,
+)
+from colossalai.tensor.d_tensor.sharding_spec import ShardingSpec
+from torch.distributed import ProcessGroup
+from torch.nn.parameter import Parameter
+
+from .utils import ChannelChunkConv3d, channel_chunk_conv3d
+
+
+def shard_channelwise(
+    tensor: torch.Tensor, group_or_device_mesh: Optional[Union[ProcessGroup, DeviceMesh]] = None
+) -> torch.Tensor:
+    """
+    Shard the second dim (dim 1) of the given tensor.
+
+    Args:
+        tensor (torch.Tensor): The tensor to be sharded.
+        group_or_device_mesh (Union[ProcessGroup, DeviceMesh], optional): The process group or 1D device mesh
+            to shard over (a mesh with more than one dim fails an assertion). If None, the tensor is sharded
+            with respect to the global process group. Defaults to None.
+
+    Returns:
+        torch.Tensor: The sharded tensor.
+    """
+    # if the group_or_device_mesh is None, we shard the tensor with respect to the global process group
+    if group_or_device_mesh is None:
+        group_or_device_mesh = dist.GroupMember.WORLD
+
+    if isinstance(group_or_device_mesh, ProcessGroup):
+        device_mesh = DeviceMesh.from_process_group(group_or_device_mesh)
+    else:
+        assert len(group_or_device_mesh.shape) == 1, "Only 1D DeviceMesh is accepted for channel-wise sharding."
+        device_mesh = group_or_device_mesh
+    # tensor dim 1 is partitioned over mesh axis 0; every other dim stays whole
+    sharding_spec = ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={1: [0]})
+
+    return distribute_tensor(tensor, device_mesh, sharding_spec)
+
+
+class Conv3dTPCol(nn.Conv3d):
+    """Conv3d with column-wise tensor parallelism (weight and bias go through ``shard_rowwise``). Inference only."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+        tp_group=None,
+        gather_output: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+    ) -> None:
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode, device, dtype
+        )
+        self.tp_group = tp_group
+        self.gather_output = gather_output
+        self.tp_size = dist.get_world_size(tp_group)
+        self.tp_rank = dist.get_rank(tp_group)
+
+        # sanity check: a given weight needs bias_ whenever bias=True; without a weight, bias_ must be None
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters. NOTE(review): the weight-is-None branch asserts the opposite, so weight is effectively required
+        if weight is None:
+            assert weight is not None, "weight must be provided"
+        else:
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+
+        if not is_distributed_tensor(self.weight):  # shard only once; the passed-in parameter is modified in place
+            sharded_weight = shard_rowwise(self.weight.data, self.tp_group)
+            sharded_tensor_to_existing_param(sharded_weight, self.weight)
+
+        if bias:
+            if bias_ is None:
+                assert bias is not None, "bias must be provided"  # NOTE(review): bias is truthy here, never fires
+            else:
+                bias_.data = bias_.data.to(device=device, dtype=dtype)
+                self.bias = bias_
+            if not is_distributed_tensor(self.bias):
+                sharded_bias = shard_rowwise(self.bias.data, self.tp_group)
+                sharded_tensor_to_existing_param(sharded_bias, self.bias)
+        else:
+            self.bias = None
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Conv3d, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch conv3d layer to a tensor parallelized layer.
+        """
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        conv3d_tp = Conv3dTPCol(
+            in_channels=module.in_channels,
+            out_channels=module.out_channels,
+            kernel_size=module.kernel_size,
+            stride=module.stride,
+            padding=module.padding,
+            dilation=module.dilation,
+            groups=module.groups,
+            bias=module.bias is not None,
+            padding_mode=module.padding_mode,
+            device=module.weight.device,
+            dtype=module.weight.dtype,
+            tp_group=process_group,
+            weight=module.weight,
+            bias_=module.bias,
+            **kwargs,
+        )
+        return conv3d_tp
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        weight = self.weight
+        bias = None
+        if self.bias is not None:
+            bias = self.bias
+        out = channel_chunk_conv3d(
+            input,
+            weight,
+            bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            ChannelChunkConv3d.CONV3D_NUMEL_LIMIT,
+        )
+        if not self.gather_output:  # hand back this rank's conv result without gathering
+            return out
+        gathered_out = gather_forward_split_backward(out, 1, self.tp_group)  # gather along dim 1 over tp_group
+        return gathered_out
+
+
+class Conv3dTPRow(nn.Conv3d):
+    """Conv3d with row-wise tensor parallelism (weight sharded on dim 1 via ``shard_channelwise``). Inference only."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+        tp_group=None,
+        split_input: bool = False,
+        split_output: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+    ) -> None:
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode, device, dtype
+        )
+        self.tp_group = tp_group
+        self.split_input = split_input
+        self.split_output = split_output
+        self.tp_size = dist.get_world_size(tp_group)
+        self.tp_rank = dist.get_rank(tp_group)
+
+        # sanity check: a given weight needs bias_ whenever bias=True; without a weight, bias_ must be None
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters. NOTE(review): the weight-is-None branch asserts the opposite, so weight is effectively required
+        if weight is None:
+            assert weight is not None, "weight must be provided"
+        else:
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+
+        if not is_distributed_tensor(self.weight):  # shard only once; the passed-in parameter is modified in place
+            sharded_weight = shard_channelwise(self.weight.data, self.tp_group)
+            sharded_tensor_to_existing_param(sharded_weight, self.weight)
+
+        if bias:
+            if bias_ is None:
+                assert bias is not None, "bias must be provided"  # NOTE(review): bias is truthy here, never fires
+            else:
+                bias_.data = bias_.data.to(device=device, dtype=dtype)
+                self.bias = bias_  # the bias is not sharded; forward adds it after the reduce
+        else:
+            self.bias = None
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Conv3d, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch conv3d layer to a tensor parallelized layer.
+        """
+
+        conv3d_tp = Conv3dTPRow(
+            in_channels=module.in_channels,
+            out_channels=module.out_channels,
+            kernel_size=module.kernel_size,
+            stride=module.stride,
+            padding=module.padding,
+            dilation=module.dilation,
+            groups=module.groups,
+            bias=module.bias is not None,
+            padding_mode=module.padding_mode,
+            device=module.weight.device,
+            dtype=module.weight.dtype,
+            tp_group=process_group,  # NOTE(review): unlike Conv3dTPCol, a list/tuple of groups is not unwrapped
+            weight=module.weight,
+            bias_=module.bias,
+            **kwargs,
+        )
+
+        return conv3d_tp
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.split_input:  # keep only this rank's slice of dim 1 of the input
+            input = split_forward_gather_backward(input, 1, self.tp_group)
+        weight = self.weight
+        out = channel_chunk_conv3d(
+            input,
+            weight,
+            None,  # no bias inside the conv; it is added once after the reduce below
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            ChannelChunkConv3d.CONV3D_NUMEL_LIMIT,
+        )
+        # del input
+        out = reduce_forward(out, self.tp_group)  # presumably an all-reduce of the per-rank partial outputs; confirm
+        if self.bias is not None:
+            out = out + self.bias[:, None, None, None]
+        if self.split_output:
+            out = split_forward_gather_backward(out, 1, self.tp_group)
+        return out
+
+
+class Conv2dTPRow(nn.Conv2d):
+    """Conv2d with row-wise tensor parallelism (weight sharded on dim 1 via ``shard_channelwise``). Inference only."""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+        tp_group=None,
+        split_input: bool = False,
+        split_output: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+    ) -> None:
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode, device, dtype
+        )
+        self.tp_group = tp_group
+        self.split_input = split_input
+        self.split_output = split_output
+        self.tp_size = dist.get_world_size(tp_group)
+        self.tp_rank = dist.get_rank(tp_group)
+
+        # sanity check: a given weight needs bias_ whenever bias=True; without a weight, bias_ must be None
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters. NOTE(review): the weight-is-None branch asserts the opposite, so weight is effectively required
+        if weight is None:
+            assert weight is not None, "weight must be provided"
+        else:
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+
+        if not is_distributed_tensor(self.weight):  # shard only once; the passed-in parameter is modified in place
+            sharded_weight = shard_channelwise(self.weight.data, self.tp_group)
+            sharded_tensor_to_existing_param(sharded_weight, self.weight)
+
+        if bias:
+            if bias_ is None:
+                assert bias is not None, "bias must be provided"  # NOTE(review): bias is truthy here, never fires
+            else:
+                bias_.data = bias_.data.to(device=device, dtype=dtype)
+                self.bias = bias_  # the bias is not sharded; forward adds it after the all-reduce
+        else:
+            self.bias = None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.split_input:  # keep only this rank's slice of dim 1 of the input
+            input = split_forward_gather_backward(input, 1, self.tp_group)
+        weight = self.weight
+        out = F.conv2d(
+            input,
+            weight,
+            None,  # no bias inside the conv; it is added once after the all-reduce below
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+        # del input
+        dist.all_reduce(out, group=self.tp_group)  # in-place all-reduce of the per-rank partial outputs
+        if self.bias is not None:
+            out += self.bias[:, None, None]
+        if self.split_output:
+            out = split_forward_gather_backward(out, 1, self.tp_group)
+        return out
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Conv2d, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch conv2d layer to a tensor parallelized layer.
+        """
+
+        conv2d_tp = Conv2dTPRow(
+            in_channels=module.in_channels,
+            out_channels=module.out_channels,
+            kernel_size=module.kernel_size,
+            stride=module.stride,
+            padding=module.padding,
+            dilation=module.dilation,
+            groups=module.groups,
+            bias=module.bias is not None,
+            padding_mode=module.padding_mode,
+            device=module.weight.device,
+            dtype=module.weight.dtype,
+            tp_group=process_group,  # NOTE(review): a list/tuple of groups is not unwrapped here
+            weight=module.weight,
+            bias_=module.bias,
+            **kwargs,
+        )
+        conv2d_tp.weight = module.weight  # NOTE(review): already passed as weight= above; looks redundant
+        conv2d_tp.bias = module.bias
+        return conv2d_tp
+
+
+class Conv1dTPRow(nn.Conv1d):
+    """Conv1d with row-wise tensor parallelism. This is only for inference.
+
+    The layer wraps an existing weight (and optional bias) parameter. Unless
+    the weight is already a distributed tensor, it is sharded with
+    ``shard_channelwise`` over ``tp_group`` and the shard is written back via
+    ``sharded_tensor_to_existing_param``. ``forward`` convolves with the local
+    shard, all-reduces the partial outputs over ``tp_group`` and only then
+    adds the bias.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+        tp_group=None,
+        split_input: bool = False,
+        split_output: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+    ) -> None:
+        """
+        Args:
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, device, dtype: Forwarded unchanged to
+                ``nn.Conv1d.__init__``. ``device`` and ``dtype`` are also
+                applied to the wrapped ``weight`` / ``bias_`` data.
+            tp_group: Process group used for sharding and for the all-reduce.
+            split_input (bool): Pass the input through
+                ``split_forward_gather_backward`` along dim 1 before the
+                convolution.
+            split_output (bool): Pass the output through
+                ``split_forward_gather_backward`` along dim 1 before returning.
+            weight (Parameter): Existing weight to wrap. Required.
+            bias_ (Parameter, optional): Existing bias to wrap. Required when
+                ``bias`` is True.
+
+        Raises:
+            AssertionError: If ``weight`` is missing, if ``bias_`` is given
+                without ``weight``, or if ``bias`` is True but ``bias_`` is
+                missing.
+        """
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, padding_mode, device, dtype
+        )
+        self.tp_group = tp_group
+        self.split_input = split_input
+        self.split_output = split_output
+        self.tp_size = dist.get_world_size(tp_group)
+        self.tp_rank = dist.get_rank(tp_group)
+
+        # sanity check
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters.
+        # The weight created by nn.Conv1d.__init__ is always replaced by the
+        # caller-supplied parameter, so a missing weight is an error.
+        assert weight is not None, "weight must be provided"
+        weight.data = weight.data.to(device=device, dtype=dtype)
+        self.weight = weight
+
+        if not is_distributed_tensor(self.weight):
+            sharded_weight = shard_channelwise(self.weight.data, self.tp_group)
+            sharded_tensor_to_existing_param(sharded_weight, self.weight)
+
+        if bias:
+            # Check the parameter itself; the ``bias`` flag is known to be
+            # truthy inside this branch.
+            assert bias_ is not None, "bias must be provided"
+            bias_.data = bias_.data.to(device=device, dtype=dtype)
+            self.bias = bias_
+        else:
+            self.bias = None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """Convolve with the local weight shard and sum the partial results.
+
+        Args:
+            input (torch.Tensor): Input activations; passed through
+                ``split_forward_gather_backward`` along dim 1 first when
+                ``split_input`` is set.
+
+        Returns:
+            torch.Tensor: The all-reduced output with the bias added after the
+                reduction; passed through ``split_forward_gather_backward``
+                along dim 1 when ``split_output`` is set.
+        """
+        if self.split_input:
+            input = split_forward_gather_backward(input, 1, self.tp_group)
+
+        weight = self.weight
+        # No bias inside the convolution: it is added once, after the
+        # all-reduce below.
+        out = F.conv1d(
+            input,
+            weight,
+            None,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+        dist.all_reduce(out, group=self.tp_group)
+        if self.bias is not None:
+            out += self.bias[:, None]
+        if self.split_output:
+            out = split_forward_gather_backward(out, 1, self.tp_group)
+        return out
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Conv1d, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch conv1d layer to a tensor parallelized layer.
+
+        Args:
+            module (nn.Conv1d): Source layer whose hyper-parameters, weight and
+                bias are handed to the new layer.
+            process_group: Process group passed to the new layer as ``tp_group``.
+            **kwargs: Extra keyword arguments forwarded to ``Conv1dTPRow``.
+
+        Returns:
+            The tensor parallelized layer.
+        """
+
+        conv1d_tp = Conv1dTPRow(
+            in_channels=module.in_channels,
+            out_channels=module.out_channels,
+            kernel_size=module.kernel_size,
+            stride=module.stride,
+            padding=module.padding,
+            dilation=module.dilation,
+            groups=module.groups,
+            bias=module.bias is not None,
+            padding_mode=module.padding_mode,
+            device=module.weight.device,
+            dtype=module.weight.dtype,
+            tp_group=process_group,
+            weight=module.weight,
+            bias_=module.bias,
+            **kwargs,
+        )
+        # Point the new layer at the source module's parameter objects.
+        conv1d_tp.weight = module.weight
+        conv1d_tp.bias = module.bias
+        return conv1d_tp
+
+
+class GroupNormTP(nn.GroupNorm):
+    """GroupNorm whose affine parameters are sharded over a tensor parallel group.
+
+    When ``affine`` is True the supplied weight and bias are sharded with
+    ``shard_rowwise`` over ``tp_group`` (unless they are already distributed
+    tensors). ``forward`` normalizes with ``num_groups // tp_size`` groups.
+    """
+
+    def __init__(
+        self,
+        num_groups: int,
+        num_channels: int,
+        eps: float = 0.00001,
+        affine: bool = True,
+        device=None,
+        dtype=None,
+        tp_group=None,
+        weight: Optional[Parameter] = None,
+        bias: Optional[Parameter] = None,
+    ) -> None:
+        """
+        Args:
+            num_groups, num_channels, eps, affine, device, dtype: Forwarded
+                unchanged to ``nn.GroupNorm.__init__``. ``device`` and
+                ``dtype`` are also applied to the wrapped parameters.
+            tp_group: Process group used for sharding.
+            weight (Parameter, optional): Existing scale parameter; required
+                when ``affine`` is True.
+            bias (Parameter, optional): Existing shift parameter; required
+                when ``affine`` is True.
+
+        Raises:
+            AssertionError: If ``affine`` is True and ``weight`` or ``bias``
+                is missing.
+        """
+        super().__init__(num_groups, num_channels, eps, affine, device, dtype)
+        self.tp_group = tp_group
+        self.tp_size = dist.get_world_size(tp_group)
+        self.tp_rank = dist.get_rank(tp_group)
+
+        if affine:
+            assert weight is not None, "weight must be provided"
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+            if not is_distributed_tensor(self.weight):
+                sharded_weight = shard_rowwise(self.weight.data, self.tp_group)
+                sharded_tensor_to_existing_param(sharded_weight, self.weight)
+
+            assert bias is not None, "bias must be provided"
+            bias.data = bias.data.to(device=device, dtype=dtype)
+            self.bias = bias
+            if not is_distributed_tensor(self.bias):
+                sharded_bias = shard_rowwise(self.bias.data, self.tp_group)
+                sharded_tensor_to_existing_param(sharded_bias, self.bias)
+        else:
+            self.weight = None
+            self.bias = None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """Apply group normalization using ``num_groups // tp_size`` groups.
+
+        NOTE(review): presumably ``input`` already holds only this rank's
+        share of the channels — confirm against the callers.
+        """
+        return F.group_norm(
+            input,
+            self.num_groups // self.tp_size,
+            self.weight,
+            self.bias,
+            self.eps,
+        )
+
+    @staticmethod
+    def from_native_module(
+        module: nn.GroupNorm, process_group: Union[ProcessGroup, List[ProcessGroup]], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch nn.GroupNorm layer to a tensor parallelized layer.
+
+        Args:
+            module (nn.GroupNorm): Source layer whose hyper-parameters, weight
+                and bias are handed to the new layer.
+            process_group: Process group passed to the new layer as ``tp_group``.
+            **kwargs: Extra keyword arguments forwarded to ``GroupNormTP``.
+
+        Returns:
+            The tensor parallelized layer.
+        """
+
+        # A non-affine GroupNorm has ``weight is None``; in that case there is
+        # no parameter to take the device/dtype from, so leave them unset.
+        ref_param = module.weight
+        group_norm_tp = GroupNormTP(
+            num_groups=module.num_groups,
+            num_channels=module.num_channels,
+            eps=module.eps,
+            affine=module.affine,
+            device=None if ref_param is None else ref_param.device,
+            dtype=None if ref_param is None else ref_param.dtype,
+            tp_group=process_group,
+            weight=module.weight,
+            bias=module.bias,
+            **kwargs,
+        )
+        return group_norm_tp
diff --git a/opensora/models/vae/utils.py b/opensora/models/vae/utils.py
new file mode 100644
index 0000000..590d0a0
--- /dev/null
+++ b/opensora/models/vae/utils.py
@@ -0,0 +1,257 @@
+import math
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+NUMEL_LIMIT = 2**30
+
+
+def ceil_to_divisible(n: int, dividend: int) -> int:
+    """Turn a requested chunk count into the count obtained with equal-sized pieces.
+
+    ``dividend`` items are cut into pieces of ``dividend // n`` items each and
+    the number of such pieces (the last one may be smaller) is returned. The
+    result is therefore at least ``n``.
+
+    Args:
+        n (int): Requested minimum number of chunks. Values outside
+            ``[1, dividend]`` are clamped into that range, so asking for more
+            chunks than there are items yields one item per chunk instead of
+            a division by zero.
+        dividend (int): Number of items to split; must be positive.
+
+    Returns:
+        int: Number of chunks.
+    """
+    n = max(1, min(n, dividend))
+    per_chunk = dividend // n
+    # Integer ceiling division.
+    return -(-dividend // per_chunk)
+
+
+def chunked_avg_pool1d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
+    """``F.avg_pool1d`` that processes oversized inputs in chunks along dim 0.
+
+    The input is pooled in one call when it holds at most ``NUMEL_LIMIT``
+    elements. Otherwise it is split along dim 0 into
+    ``ceil(numel / NUMEL_LIMIT)`` chunks, each chunk is pooled separately and
+    the results are concatenated along dim 0.
+
+    Args:
+        input (torch.Tensor): Tensor to pool.
+        kernel_size, stride, padding, ceil_mode, count_include_pad: Forwarded
+            unchanged to ``F.avg_pool1d``.
+
+    Returns:
+        torch.Tensor: The pooled tensor.
+    """
+    n_chunks = math.ceil(input.numel() / NUMEL_LIMIT)
+    # ``<=`` so that an empty input (zero chunks) also takes the direct path.
+    if n_chunks <= 1:
+        return F.avg_pool1d(input, kernel_size, stride, padding, ceil_mode, count_include_pad)
+
+    out_list = [
+        F.avg_pool1d(inp_chunk, kernel_size, stride, padding, ceil_mode, count_include_pad)
+        for inp_chunk in input.chunk(n_chunks, dim=0)
+    ]
+    return torch.cat(out_list, dim=0)
+
+
+def chunked_interpolate(input, scale_factor):
+    """``F.interpolate`` that processes oversized outputs in chunks along dim 1.
+
+    The expected output size is estimated by scaling every dim after the
+    first two by ``scale_factor``. If that estimate needs exactly one chunk of
+    ``NUMEL_LIMIT`` elements the input is interpolated directly; otherwise it
+    is split along dim 1, each piece is interpolated, and the pieces are
+    concatenated along dim 1.
+
+    Args:
+        input (torch.Tensor): Tensor to resize.
+        scale_factor: Multiplier forwarded to ``F.interpolate``; it is also
+            multiplied with each trailing dim to size the output.
+
+    Returns:
+        torch.Tensor: The interpolated tensor.
+    """
+    leading_dims = list(input.shape[:2])
+    scaled_dims = [int(size * scale_factor) for size in input.shape[2:]]
+    n_chunks = math.ceil(torch.Size(leading_dims + scaled_dims).numel() / NUMEL_LIMIT)
+    if n_chunks == 1:
+        return F.interpolate(input, scale_factor=scale_factor)
+
+    # One chunk more than the computed minimum is requested.
+    pieces = [F.interpolate(piece, scale_factor=scale_factor) for piece in input.chunk(n_chunks + 1, dim=1)]
+    return torch.cat(pieces, dim=1)
+
+
+def get_conv3d_output_shape(
+    input_shape: torch.Size, out_channels: int, kernel_size: list, stride: list, padding: list, dilation: list
+) -> list:
+    """Compute the output shape of a 3D convolution without running it.
+
+    Args:
+        input_shape (torch.Size): Shape of the input; the last three entries
+            are the convolved dims. The first entry is carried over to the
+            result only when the shape has exactly five entries.
+        out_channels (int): Number of output channels.
+        kernel_size (list): Kernel size for each of the three convolved dims.
+        stride (list): Stride for each of the three convolved dims.
+        padding (list): Padding for each of the three convolved dims.
+        dilation (list): Dilation for each of the three convolved dims.
+
+    Returns:
+        list: ``[input_shape[0], out_channels, d0, d1, d2]`` for a 5-entry
+            input shape, otherwise ``[out_channels, d0, d1, d2]``.
+    """
+    output_shape = [input_shape[0]] if len(input_shape) == 5 else []
+    output_shape.append(out_channels)
+    for i, d in enumerate(input_shape[-3:]):
+        effective_kernel = dilation[i] * (kernel_size[i] - 1) + 1
+        # Integer floor division keeps the arithmetic exact for any size.
+        output_shape.append((d + 2 * padding[i] - effective_kernel) // stride[i] + 1)
+    return output_shape
+
+
+def get_conv3d_n_chunks(numel: int, n_channels: int, numel_limit: int):
+    """Return how many channel chunks a tensor of ``numel`` elements is split into.
+
+    Args:
+        numel (int): Total number of elements of the tensor.
+        n_channels (int): Size of the channel dim that will be chunked.
+        numel_limit (int): Maximum number of elements aimed for per chunk.
+
+    Returns:
+        int: ``ceil(numel / numel_limit)`` adjusted by ``ceil_to_divisible``
+            with respect to ``n_channels``.
+    """
+    minimum_chunks = math.ceil(numel / numel_limit)
+    return ceil_to_divisible(minimum_chunks, n_channels)
+
+
+def channel_chunk_conv3d(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: list,
+    padding: list,
+    dilation: list,
+    groups: int,
+    numel_limit: int,
+):
+    """Run ``F.conv3d`` in channel-wise pieces when the tensors are very large.
+
+    The number of input-channel and output-channel chunks is derived from the
+    element counts of the input and of the expected output. With one chunk
+    each, a plain ``F.conv3d`` is returned. Otherwise every output-channel
+    chunk is built by summing, in float32, the convolutions of each
+    input-channel shard with the matching weight slice; the sum is cast back
+    to the input dtype, the bias slice is added, and all output chunks are
+    concatenated along dim 1.
+
+    Args:
+        input (torch.Tensor): Input tensor, chunked along dim 1.
+        weight (torch.Tensor): Convolution weight, chunked along dim 0 (output
+            channels) and dim 1 (input channels).
+        bias (torch.Tensor): Optional bias (may be None), chunked like the
+            output channels.
+        stride, padding, dilation, groups: Forwarded to ``F.conv3d``.
+        numel_limit (int): Element budget used to pick the chunk counts.
+
+    Returns:
+        torch.Tensor: The convolution result.
+    """
+    out_channels, in_channels = weight.shape[:2]
+    output_shape = get_conv3d_output_shape(input.shape, out_channels, weight.shape[2:], stride, padding, dilation)
+    n_in_chunks = get_conv3d_n_chunks(input.numel(), in_channels, numel_limit)
+    n_out_chunks = get_conv3d_n_chunks(np.prod(output_shape), out_channels, numel_limit)
+    if n_in_chunks == 1 and n_out_chunks == 1:
+        return F.conv3d(input, weight, bias, stride, padding, dilation, groups)
+
+    input_shards = input.chunk(n_in_chunks, dim=1)
+    weight_chunks = weight.chunk(n_out_chunks)
+    bias_chunks = [None] * n_out_chunks if bias is None else bias.chunk(n_out_chunks)
+
+    output_list = []
+    for w_out, b_out in zip(weight_chunks, bias_chunks):
+        acc = None
+        for x_part, w_part in zip(input_shards, w_out.chunk(n_in_chunks, dim=1)):
+            # Partial sums are accumulated in float32.
+            partial = F.conv3d(x_part, w_part, None, stride, padding, dilation, groups).float()
+            if acc is None:
+                acc = partial
+            else:
+                acc += partial
+        acc = acc.to(input.dtype)
+        if b_out is not None:
+            acc += b_out[None, :, None, None, None]
+        # Chunks are collected and concatenated rather than copied into a
+        # preallocated output: in-place writes cannot be used during training.
+        output_list.append(acc)
+    return torch.cat(output_list, dim=1)
+
+
+class DiagonalGaussianDistribution(object):
+    """Gaussian with diagonal covariance, parameterised by mean and log-variance.
+
+    ``parameters`` is split into two halves along dim 1: the first half is the
+    mean and the second half the log-variance, which is clamped to
+    ``[-30, 20]``.
+    """
+
+    def __init__(
+        self,
+        parameters,
+        deterministic=False,
+    ):
+        """
+        Args:
+            parameters (torch.Tensor): Mean and log-variance stacked along dim 1.
+            deterministic (bool): If True, the variance and standard deviation
+                are set to zero.
+        """
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device, dtype=self.mean.dtype)
+
+    def sample(self):
+        """Draw ``mean + std * eps`` with ``eps`` from a standard normal."""
+        # torch.randn: standard normal distribution
+        # The noise is drawn with the default (CPU) generator and then moved;
+        # this is kept as is so that seeded runs stay reproducible.
+        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device, dtype=self.mean.dtype)
+        return x
+
+    def kl(self, other=None):
+        """KL divergence to ``other`` (or to a standard normal when None).
+
+        Args:
+            other (DiagonalGaussianDistribution, optional): Reference
+                distribution; a standard normal is assumed when omitted.
+
+        Returns:
+            torch.Tensor: A single zero when ``deterministic`` is set.
+                Otherwise the divergence summed over dims 1, 3 and 4 and
+                flattened, which requires 5-D mean/log-variance tensors.
+        """
+        if self.deterministic:
+            # Create the zero on the parameters' device and dtype so that it
+            # can be combined with other loss terms without a device mismatch.
+            return torch.zeros(1, device=self.parameters.device, dtype=self.mean.dtype)
+        if other is None:  # SCH: assumes other is a standard normal distribution
+            return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 3, 4]).flatten(0)
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var
+            - 1.0
+            - self.logvar
+            + other.logvar,
+            dim=[1, 3, 4],
+        ).flatten(0)
+
+    def mode(self):
+        """Return the mean, which is the mode of a Gaussian."""
+        return self.mean
+
+
+class ChannelChunkConv3d(nn.Conv3d):
+    """``nn.Conv3d`` that splits very large convolutions along the channel dims.
+
+    Inputs whose per-sample element count stays below ``CONV3D_NUMEL_LIMIT``
+    use the regular ``nn.Conv3d`` forward. Larger inputs are convolved shard
+    by shard over input-channel and output-channel chunks, and the partial
+    results are summed and concatenated.
+    """
+
+    # Element-count threshold above which the convolution is chunked.
+    CONV3D_NUMEL_LIMIT = 2**31
+
+    def _get_output_numel(self, input_shape: torch.Size) -> int:
+        """Return the number of elements the convolution output will have."""
+        numel = self.out_channels
+        if len(input_shape) == 5:
+            numel *= input_shape[0]
+        for i, d in enumerate(input_shape[-3:]):
+            d_out = math.floor(
+                (d + 2 * self.padding[i] - self.dilation[i] * (self.kernel_size[i] - 1) - 1) / self.stride[i] + 1
+            )
+            numel *= d_out
+        return numel
+
+    def _get_n_chunks(self, numel: int, n_channels: int):
+        """Return the number of channel chunks for a tensor of ``numel`` elements."""
+        n_chunks = math.ceil(numel / ChannelChunkConv3d.CONV3D_NUMEL_LIMIT)
+        n_chunks = ceil_to_divisible(n_chunks, n_channels)
+        return n_chunks
+
+    def forward(self, input: Tensor) -> Tensor:
+        """Apply the convolution, chunking over channels for oversized inputs.
+
+        Args:
+            input (Tensor): Input tensor; channels are expected in dim 1.
+
+        Returns:
+            Tensor: The convolution output.
+        """
+        # ``max(..., 1)`` keeps an input with an empty leading dim from
+        # dividing by zero; such an input takes the regular path.
+        per_sample_numel = input.numel() // max(input.size(0), 1)
+        if per_sample_numel < ChannelChunkConv3d.CONV3D_NUMEL_LIMIT:
+            return super().forward(input)
+        n_in_chunks = self._get_n_chunks(input.numel(), self.in_channels)
+        n_out_chunks = self._get_n_chunks(self._get_output_numel(input.shape), self.out_channels)
+        if n_in_chunks == 1 and n_out_chunks == 1:
+            return super().forward(input)
+        outputs = []
+        input_shards = input.chunk(n_in_chunks, dim=1)
+        weight_chunks = self.weight.chunk(n_out_chunks)
+        # A layer built with ``bias=False`` has no bias tensor to chunk.
+        if self.bias is None:
+            bias_chunks = [None] * len(weight_chunks)
+        else:
+            bias_chunks = self.bias.chunk(n_out_chunks)
+        for weight, bias in zip(weight_chunks, bias_chunks):
+            weight_shards = weight.chunk(n_in_chunks, dim=1)
+            o = None
+            for x, w in zip(input_shards, weight_shards):
+                if o is None:
+                    # The bias is added once, together with the first shard.
+                    o = F.conv3d(x, w, bias, self.stride, self.padding, self.dilation, self.groups)
+                else:
+                    o += F.conv3d(x, w, None, self.stride, self.padding, self.dilation, self.groups)
+            outputs.append(o)
+        return torch.cat(outputs, dim=1)
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
+def pad_for_conv3d(x: torch.Tensor, width_pad: int, height_pad: int, time_pad: int) -> torch.Tensor:
+    """Pad a tensor ahead of a 3D convolution.
+
+    The last two dims are zero-padded symmetrically by ``width_pad`` and
+    ``height_pad``; the third-from-last dim is then padded on both sides by
+    ``time_pad`` in ``replicate`` mode. Either step is skipped when its
+    amounts are not positive.
+
+    Args:
+        x (torch.Tensor): Tensor to pad.
+        width_pad (int): Zero padding on each side of the last dim.
+        height_pad (int): Zero padding on each side of the second-to-last dim.
+        time_pad (int): Replicate padding on each side of the third-from-last dim.
+
+    Returns:
+        torch.Tensor: The padded tensor (``x`` itself when nothing is padded).
+    """
+    needs_spatial_pad = width_pad > 0 or height_pad > 0
+    if needs_spatial_pad:
+        spatial = (width_pad, width_pad, height_pad, height_pad)
+        x = F.pad(x, spatial, mode="constant", value=0)
+    if time_pad > 0:
+        temporal = (0, 0, 0, 0, time_pad, time_pad)
+        x = F.pad(x, temporal, mode="replicate")
+    return x
+
+
+def pad_for_conv3d_kernel_3x3x3(x: torch.Tensor) -> torch.Tensor:
+    """Pad a tensor by one element per side for a 3x3x3 convolution.
+
+    The last two dims are zero-padded and the third-from-last dim is padded
+    in ``replicate`` mode. Tensors that need more than one chunk of
+    ``NUMEL_LIMIT`` elements are split along dim 1, padded piece by piece and
+    concatenated along dim 1.
+
+    Args:
+        x (torch.Tensor): Tensor to pad.
+
+    Returns:
+        torch.Tensor: The padded tensor.
+    """
+
+    def _pad_once(t: torch.Tensor) -> torch.Tensor:
+        t = F.pad(t, (1, 1, 1, 1), mode="constant", value=0)
+        return F.pad(t, (0, 0, 0, 0, 1, 1), mode="replicate")
+
+    n_chunks = math.ceil(x.numel() / NUMEL_LIMIT)
+    if n_chunks == 1:
+        return _pad_once(x)
+
+    # One chunk more than the computed minimum is requested.
+    padded_pieces = [_pad_once(piece) for piece in x.chunk(n_chunks + 1, dim=1)]
+    return torch.cat(padded_pieces, dim=1)
+
+
+class PadConv3D(nn.Module):
+    """
+    3D convolution preceded by an explicit padding step.
+
+    Only cubic kernels of size 1 or 3 are supported. Size 3 pads the input
+    with ``pad_for_conv3d_kernel_3x3x3``; size 1 leaves it untouched. The
+    convolution itself uses stride 1 and no padding.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3):
+        """
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size; an int is expanded to three equal
+                entries, a 3-entry sequence is used as given.
+
+        Raises:
+            AssertionError: If the kernel is not cubic or its size is not 1 or 3.
+        """
+        super().__init__()
+
+        self.kernel_size = kernel_size = (kernel_size,) * 3 if isinstance(kernel_size, int) else kernel_size
+
+        # == specific padding ==
+        k_time, k_height, k_width = kernel_size
+        assert k_time == k_height == k_width, "only support cubic kernel size"
+        if k_time != 3:
+            assert k_time == 1, f"only support kernel size 1/3 for now, got {kernel_size}"
+            self.pad = lambda x: x
+        else:
+            self.pad = pad_for_conv3d_kernel_3x3x3
+
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, stride=1, padding=0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Pad ``x`` and apply the convolution."""
+        return self.conv(self.pad(x))
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
+class ChannelChunkPadConv3D(PadConv3D):
+    """``PadConv3D`` whose convolution is a ``ChannelChunkConv3d``."""
+
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 3):
+        """
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size, validated by ``PadConv3D``.
+        """
+        super().__init__(in_channels, out_channels, kernel_size)
+        # Swap the convolution created by the parent for the chunking variant.
+        self.conv = ChannelChunkConv3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+        )
diff --git a/opensora/registry.py b/opensora/registry.py
new file mode 100644
index 0000000..5117eaf
--- /dev/null
+++ b/opensora/registry.py
@@ -0,0 +1,41 @@
+from copy import deepcopy
+
+import torch.nn as nn
+from mmengine.registry import Registry
+
+
+def build_module(module: dict | nn.Module, builder: Registry, **kwargs) -> nn.Module | None:
+    """Build module from config or return the module itself.
+
+    Args:
+        module (dict | nn.Module | None): A config dict to build from, an
+            already constructed module, or None.
+        builder (Registry): The registry used to build a module from a config.
+        **kwargs: Entries written into a copy of the config (overriding
+            existing keys) before it is passed to ``builder.build``. Ignored
+            when ``module`` is not a dict.
+
+    Returns:
+        (None | nn.Module): None when ``module`` is None, the module itself
+            when it is already an ``nn.Module``, otherwise the built module.
+
+    Raises:
+        TypeError: If ``module`` is neither None, a dict, nor an ``nn.Module``.
+    """
+    if module is None:
+        return None
+    if isinstance(module, nn.Module):
+        return module
+    if isinstance(module, dict):
+        # Work on a copy so that the caller's config is left untouched.
+        cfg = deepcopy(module)
+        cfg.update(kwargs)
+        return builder.build(cfg)
+    raise TypeError(f"Only support dict and nn.Module, but got {type(module)}.")
+
+
+# Registry named "model".
+# NOTE(review): ``locations`` presumably lists the packages mmengine imports
+# to populate the registry — confirm against the mmengine documentation.
+MODELS = Registry(
+    "model",
+    locations=["opensora.models"],
+)
+
+# Registry named "dataset", configured the same way for ``opensora.datasets``.
+DATASETS = Registry(
+    "dataset",
+    locations=["opensora.datasets"],
+)
diff --git a/opensora/utils/__init__.py b/opensora/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opensora/utils/cai.py b/opensora/utils/cai.py
new file mode 100644
index 0000000..090ba78
--- /dev/null
+++ b/opensora/utils/cai.py
@@ -0,0 +1,91 @@
+import colossalai
+import torch
+import torch.distributed as dist
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+
+from opensora.acceleration.parallel_states import (
+    get_sequence_parallel_group,
+    get_tensor_parallel_group,
+    set_sequence_parallel_group,
+)
+from opensora.models.hunyuan_vae.policy import HunyuanVaePolicy
+from opensora.models.mmdit.distributed import MMDiTPolicy
+from opensora.utils.logger import is_distributed
+from opensora.utils.train import create_colossalai_plugin
+
+from .logger import log_message
+
+
+def set_group_size(plugin_config: dict):
+    """
+    Cap the tensor / sequence parallel size at the number of visible CUDA devices.
+
+    ``tp_size`` and ``sp_size`` are read from ``plugin_config`` (default 1).
+    Whichever is larger than 1 is capped at ``torch.cuda.device_count()``,
+    written back into ``plugin_config`` and logged. The two sizes cannot both
+    be larger than 1.
+
+    Args:
+        plugin_config (dict): Plugin configuration; updated in place.
+
+    Raises:
+        AssertionError: If one size is larger than 1 while the other is not 1.
+    """
+    tp_size = int(plugin_config.get("tp_size", 1))
+    sp_size = int(plugin_config.get("sp_size", 1))
+
+    if tp_size > 1:
+        assert sp_size == 1
+        tp_size = min(tp_size, torch.cuda.device_count())
+        plugin_config["tp_size"] = tp_size
+        log_message(f"Using TP with size {tp_size}")
+
+    if sp_size > 1:
+        assert tp_size == 1
+        sp_size = min(sp_size, torch.cuda.device_count())
+        plugin_config["sp_size"] = sp_size
+        log_message(f"Using SP with size {sp_size}")
+
+
+def init_inference_environment():
+    """
+    Initialize the inference environment.
+
+    Does nothing unless ``is_distributed()`` reports a distributed run. In
+    that case ColossalAI is launched from the torch environment and, when the
+    world size is larger than 1, the world group is registered as the
+    sequence parallel group.
+    """
+    if not is_distributed():
+        return
+    colossalai.launch_from_torch({})
+    if DistCoordinator().world_size > 1:
+        set_sequence_parallel_group(dist.group.WORLD)
+
+
+def get_booster(cfg: dict, ae: bool = False):
+    """
+    Create a ColossalAI ``Booster`` for the "hybrid" plugin.
+
+    Args:
+        cfg (dict): Configuration. The plugin type is read from ``plugin``
+            (default "zero2") and its settings from ``plugin_config``; with
+            ``ae`` set, the keys ``plugin_ae`` and ``plugin_config_ae`` are
+            used instead. ``dtype`` and ``grad_clip`` are read as well.
+        ae (bool): Select the ``_ae`` config keys and ``HunyuanVaePolicy``
+            instead of ``MMDiTPolicy``.
+
+    Returns:
+        Booster | None: A booster when the plugin type is "hybrid", else None.
+    """
+    suffix = "_ae" if ae else ""
+    plugin_type = cfg.get(f"plugin{suffix}", "zero2")
+    plugin_config = cfg.get(f"plugin_config{suffix}", {})
+    if plugin_type != "hybrid":
+        return None
+
+    # May update the TP/SP sizes stored in plugin_config.
+    set_group_size(plugin_config)
+    plugin = create_colossalai_plugin(
+        plugin=plugin_type,
+        dtype=cfg.get("dtype", "bf16"),
+        grad_clip=cfg.get("grad_clip", 0),
+        **plugin_config,
+        custom_policy=HunyuanVaePolicy if ae else MMDiTPolicy,
+    )
+    return Booster(plugin=plugin)
+
+
+def get_is_saving_process(cfg: dict):
+    """
+    Check if the current process is the one that saves the model.
+
+    Args:
+        cfg (dict): Configuration; ``plugin`` (default "zero2") and
+            ``plugin_config`` (default ``{}``) are read from it. Missing
+            ``tp_size`` / ``sp_size`` entries default to 1, matching
+            ``set_group_size``.
+
+    Returns:
+        bool: True if the current process is the one that saves the model.
+            Always True for non-hybrid plugins. For the hybrid plugin it is
+            True on rank 0 of the tensor parallel group when ``tp_size > 1``
+            or on rank 0 of the sequence parallel group when ``sp_size > 1``.
+
+    NOTE(review): with the hybrid plugin and both sizes equal to 1 this
+    returns False on every rank — confirm that this is intended.
+    """
+    plugin_type = cfg.get("plugin", "zero2")
+    if plugin_type != "hybrid":
+        return True
+
+    plugin_config = cfg.get("plugin_config", {})
+    tp_size = int(plugin_config.get("tp_size", 1))
+    sp_size = int(plugin_config.get("sp_size", 1))
+    is_saving_process = (tp_size > 1 and dist.get_rank(get_tensor_parallel_group()) == 0) or (
+        sp_size > 1 and dist.get_rank(get_sequence_parallel_group()) == 0
+    )
+    return is_saving_process
diff --git a/opensora/utils/ckpt.py b/opensora/utils/ckpt.py
new file mode 100644
index 0000000..1065a27
--- /dev/null
+++ b/opensora/utils/ckpt.py
@@ -0,0 +1,524 @@
+import functools
+import json
+import operator
+import os
+import re
+import shutil
+from glob import glob
+from typing import Dict, Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from colossalai.booster import Booster
+from colossalai.checkpoint_io import GeneralCheckpointIO
+from colossalai.utils.safetensors import save as async_save
+from colossalai.zero.low_level import LowLevelZeroOptimizer
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from tensornvme.async_file_io import AsyncFileWriter
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+
+from opensora.acceleration.parallel_states import get_data_parallel_group
+
+from .logger import log_message
+
+# Hugging Face endpoint: taken from the HF_ENDPOINT environment variable when
+# it is set, otherwise the public hub URL.
+hf_endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
+# Set for the whole process at import time.
+# NOTE(review): presumably read by tensornvme to enable debug output — confirm.
+os.environ["TENSORNVME_DEBUG"] = "1"
+
+
+def load_from_hf_hub(repo_path: str, cache_dir: str = None) -> str:
+    """
+    Download a checkpoint file from the Hugging Face Hub.
+
+    Args:
+        repo_path (str): Hub path of the file. Everything before the last "/"
+            is used as the repository id and the remainder as the file name.
+        cache_dir (str): The directory to cache the downloaded checkpoint.
+
+    Returns:
+        str: The local path returned by ``hf_hub_download``.
+    """
+    repo_id, _, repo_file = repo_path.rpartition("/")
+    return hf_hub_download(repo_id=repo_id, filename=repo_file, cache_dir=cache_dir)
+
+
+def load_from_sharded_state_dict(model: nn.Module, ckpt_path: str, model_name: str = "model", strict=False):
+    """
+    Load a sharded checkpoint into a model with ``GeneralCheckpointIO``.
+
+    Args:
+        model (nn.Module): The model to load the checkpoint into.
+        ckpt_path (str): Directory that contains the checkpoint.
+        model_name (str): Name of the model entry inside ``ckpt_path``.
+        strict (bool): Whether to strictly enforce that the keys in the checkpoint match the keys in the model.
+    """
+    model_dir = os.path.join(ckpt_path, model_name)
+    GeneralCheckpointIO().load_model(model, model_dir, strict=strict)
+
+
+def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
+    """
+    Log the missing and unexpected keys reported while loading a model.
+
+    Missing keys are logged first, then unexpected keys; a separator line is
+    logged between the two when both are present. When neither list has
+    entries a success message is logged instead.
+
+    Args:
+        missing (list[str]): The missing keys.
+        unexpected (list[str]): The unexpected keys.
+    """
+    has_missing = len(missing) > 0
+    has_unexpected = len(unexpected) > 0
+
+    if not (has_missing or has_unexpected):
+        log_message("Model loaded successfully")
+        return
+
+    if has_missing:
+        log_message(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
+    if has_missing and has_unexpected:
+        log_message("\n" + "-" * 79 + "\n")
+    if has_unexpected:
+        log_message(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
+
+
+def _rename_state_dict_keys(state_dict: dict, rename_keys: dict | None) -> dict:
+    """Return ``state_dict`` with its keys rewritten according to ``rename_keys``.
+
+    For every key, the first ``rename_keys`` entry whose old string occurs
+    anywhere in the key is applied with ``str.replace`` (so every occurrence
+    is replaced). Keys without a match are kept as they are.
+    """
+    if rename_keys is None:
+        return state_dict
+    renamed = {}
+    for old_key, value in state_dict.items():
+        new_key = old_key
+        for old_key_prefix, new_key_prefix in rename_keys.items():
+            if old_key_prefix in old_key:
+                new_key = old_key.replace(old_key_prefix, new_key_prefix)
+                print(f"Renamed {old_key} to {new_key} in the loaded state_dict")
+                break
+        renamed[new_key] = value
+    return renamed
+
+
+def load_checkpoint(
+    model: nn.Module,
+    path: str,
+    cache_dir: str = None,
+    device_map: torch.device | str = "cpu",
+    cai_model_name: str = "model",
+    strict: bool = False,
+    rename_keys: dict = None,  # rename keys in the checkpoint to support fine-tuning with a different model architecture; map old_key_prefix to new_key_prefix
+) -> nn.Module:
+    """
+    Loads a checkpoint into model from a path. Support three types of checkpoints:
+        1. huggingface safetensors
+        2. local .pt or .pth
+        3. colossalai sharded checkpoint
+
+    If ``path`` does not exist locally it is treated as a Hugging Face Hub
+    path and downloaded first.
+
+    Args:
+        model (nn.Module): The model to load the checkpoint into.
+        path (str): The path to the checkpoint.
+        cache_dir (str): The directory to cache the downloaded checkpoint.
+        device_map (torch.device | str): The device to map a ``.pt`` / ``.pth``
+            checkpoint to. Safetensors files are always loaded to CPU.
+        cai_model_name (str): The name of the model in the checkpoint.
+        strict (bool): Whether the checkpoint keys must match the model keys exactly.
+        rename_keys (dict): Optional mapping from old to new key fragments,
+            applied to safetensors and ``.pt`` / ``.pth`` state dicts before
+            loading. Not applied to sharded checkpoints.
+
+    Returns:
+        nn.Module: The model with the loaded checkpoint.
+
+    Raises:
+        AssertionError: If the checkpoint cannot be found, or if ``path`` has
+            no known file extension and is not a directory.
+    """
+    if not os.path.exists(path):
+        log_message(f"Checkpoint not found at {path}, trying to download from Hugging Face Hub")
+        path = load_from_hf_hub(path, cache_dir)
+    assert os.path.exists(path), f"Could not find checkpoint at {path}"
+
+    log_message(f"Loading checkpoint from {path}")
+    if path.endswith(".safetensors"):
+        ckpt = _rename_state_dict_keys(load_file(path, device='cpu'), rename_keys)
+        missing, unexpected = model.load_state_dict(ckpt, strict=strict)
+        print_load_warning(missing, unexpected)
+    elif path.endswith((".pt", ".pth")):
+        # SECURITY: torch.load unpickles the file, which can execute arbitrary
+        # code. Only load .pt/.pth checkpoints from trusted sources.
+        ckpt = torch.load(path, map_location=device_map)
+        # Apply the same key renaming as for safetensors checkpoints.
+        ckpt = _rename_state_dict_keys(ckpt, rename_keys)
+        missing, unexpected = model.load_state_dict(ckpt, strict=strict)
+        print_load_warning(missing, unexpected)
+    else:
+        assert os.path.isdir(path), f"Invalid checkpoint path: {path}"
+        load_from_sharded_state_dict(model, path, model_name=cai_model_name, strict=strict)
+    return model
+
+
+def rm_checkpoints(
+    save_dir: str,
+    keep_n_latest: int = 0,
+):
+    """
+    Remove old checkpoints.
+
+    Checkpoint directories named ``epoch<E>-global_step<S>`` inside
+    ``save_dir`` are ordered by ``(E, S)``. All but the ``keep_n_latest``
+    newest are emptied: their files and sub-directories are deleted, except a
+    sub-directory named ``eval``. The checkpoint directory itself is kept.
+    Only global rank 0 removes anything.
+
+    Args:
+        save_dir (str): The directory to save the checkpoints.
+        keep_n_latest (int): The number of latest checkpoints to keep. Nothing
+            is removed when this is not positive.
+    """
+    if keep_n_latest <= 0 or dist.get_rank() != 0:
+        return
+
+    pattern = re.compile(r"epoch(\d+)-global_step(\d+)")
+    ranked = []
+    for path in glob(os.path.join(save_dir, "epoch*-global_step*")):
+        # Match on the entry name only, so that a parent directory that
+        # happens to look like a checkpoint name cannot supply the sort key.
+        match = pattern.search(os.path.basename(path))
+        if match is None:
+            # Matched the glob but carries no epoch/step numbers: not a
+            # checkpoint written by this scheme, leave it alone.
+            continue
+        ranked.append((tuple(map(int, match.groups())), path))
+    ranked.sort(key=lambda entry: entry[0], reverse=True)
+
+    for _, ckpt_dir in ranked[keep_n_latest:]:
+        # shutil.rmtree(ckpt_dir) would also delete the "eval" sub-directory.
+        for item in glob(os.path.join(ckpt_dir, "*")):
+            if os.path.isdir(item):
+                if os.path.basename(item) != "eval":
+                    shutil.rmtree(item)
+            else:
+                os.remove(item)
+
+
+def model_sharding(model: torch.nn.Module, device: torch.device = None):
+    """
+    Sharding the model parameters across multiple GPUs.
+
+    Every parameter is flattened, zero-padded to a multiple of the world
+    size, split into equal pieces, and replaced in place by the piece that
+    belongs to the current global rank.
+
+    Args:
+        model (torch.nn.Module): The model to shard.
+        device (torch.device): The device to place the shards on. When None,
+            each shard stays on the device of its own parameter.
+    """
+    global_rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    for _, param in model.named_parameters():
+        # Resolve the target per parameter; overwriting ``device`` here would
+        # apply the first parameter's device to all later parameters.
+        target_device = param.device if device is None else device
+        flat_param = param.data.view(-1)
+        padding_size = (world_size - flat_param.numel() % world_size) % world_size
+        if padding_size > 0:
+            flat_param = torch.nn.functional.pad(flat_param, [0, padding_size])
+        shard = flat_param.split(flat_param.numel() // world_size)[global_rank]
+        param.data = shard.to(target_device)
+
+
+def model_gathering(model: torch.nn.Module, model_shape_dict: dict, pinned_state_dict: dict) -> None:
+    """
+    Gather the sharded model parameters from all ranks into ``pinned_state_dict``.
+
+    Every parameter shard is all-gathered over the world group. On global
+    rank 0 the shards are concatenated, the padding is removed, the result is
+    reshaped and copied into ``pinned_state_dict``; remaining state-dict
+    entries that are not parameters are copied as they are. All ranks
+    synchronise on a barrier at the end.
+
+    Args:
+        model (torch.nn.Module): The model to gather.
+        model_shape_dict (dict): The unsharded shape of each parameter, by name.
+        pinned_state_dict (dict): Destination tensors by name; filled on rank 0.
+    """
+    is_master = int(dist.get_rank()) == 0
+    n_ranks = dist.get_world_size()
+    param_names = set()
+
+    for name, param in model.named_parameters():
+        param_names.add(name)
+        shards = [torch.empty_like(param.data) for _ in range(n_ranks)]
+        dist.all_gather(shards, param.data, group=dist.group.WORLD)
+        if not is_master:
+            continue
+        full_shape = model_shape_dict[name]
+        flat = torch.cat(shards)
+        pinned_state_dict[name].copy_(remove_padding(flat, full_shape).view(full_shape))
+
+    if is_master:
+        # Entries that are not parameters were never sharded.
+        for key, value in model.state_dict(keep_vars=True).items():
+            if key not in param_names:
+                pinned_state_dict[key].copy_(value)
+
+    dist.barrier()
+
+
+def remove_padding(tensor: torch.Tensor, original_shape: tuple) -> torch.Tensor:
+    """
+    Remove padding from a tensor.
+
+    Args:
+        tensor (torch.Tensor): The tensor to remove padding from; it is
+            sliced along its first dim.
+        original_shape (tuple): The original shape of the tensor. An empty
+            shape (0-dim tensor) counts as one element.
+
+    Returns:
+        torch.Tensor: The leading ``prod(original_shape)`` entries of ``tensor``.
+    """
+    # The initial value 1 makes the product well defined for an empty shape.
+    numel = functools.reduce(operator.mul, original_shape, 1)
+    return tensor[:numel]
+
+
+def record_model_param_shape(model: torch.nn.Module) -> dict:
+    """
+    Map every named parameter of a model to its shape.
+
+    Args:
+        model (torch.nn.Module): The model to record the parameter shape of.
+
+    Returns:
+        dict: Parameter name -> ``torch.Size`` of that parameter.
+    """
+    return {name: param.shape for name, param in model.named_parameters()}
+
+
+def load_json(file_path: str) -> dict:
+    """
+    Read a UTF-8 encoded JSON file and return its parsed content.
+
+    Args:
+        file_path (str): The path to the JSON file.
+
+    Returns:
+        dict: The loaded JSON file.
+    """
+    with open(file_path, mode="r", encoding="utf-8") as json_file:
+        parsed = json.load(json_file)
+    return parsed
+
+
+def save_json(data, file_path: str):
+    """
+    Write ``data`` to a UTF-8 encoded JSON file with 4-space indentation.
+
+    Args:
+        data: The dictionary to save.
+        file_path (str): The path to save the JSON file.
+    """
+    with open(file_path, mode="w", encoding="utf-8") as json_file:
+        json.dump(data, fp=json_file, indent=4)
+
+
+def _prepare_ema_pinned_state_dict(model: nn.Module, ema_shape_dict: dict):
+    """Allocate pinned CPU tensors for every state-dict entry of ``model``.
+
+    Args:
+        model (nn.Module): Model whose entries are mirrored.
+        ema_shape_dict (dict): Shape to allocate for each named parameter.
+
+    Returns:
+        dict: Name -> uninitialised pinned CPU tensor. Parameters use the
+            shape from ``ema_shape_dict`` and the parameter's dtype; all other
+            state-dict entries use their own shape and dtype.
+    """
+
+    def _pinned_cpu_buffer(shape, dtype):
+        return torch.empty(shape, pin_memory=True, device="cpu", dtype=dtype)
+
+    pinned = {name: _pinned_cpu_buffer(ema_shape_dict[name], p.dtype) for name, p in model.named_parameters()}
+    # handle buffers
+    for key, value in model.state_dict(keep_vars=True).items():
+        if key not in pinned:
+            pinned[key] = _pinned_cpu_buffer(value.shape, value.dtype)
+    return pinned
+
+
+def _search_valid_path(path: str) -> str:
+    """
+    Resolve a checkpoint path, preferring ``<path>.safetensors`` over ``<path>.pt``.
+
+    Returns ``path`` unchanged when neither suffixed file exists.
+    """
+    for suffix in (".safetensors", ".pt"):
+        candidate = f"{path}{suffix}"
+        if os.path.exists(candidate):
+            return candidate
+    return path
+
+
+def master_weights_gathering(model: torch.nn.Module, optimizer: LowLevelZeroOptimizer, pinned_state_dict: dict) -> None:
+    """
+    Gather the optimizer's master weights from all ranks and copy them into rank 0's buffers.
+
+    Args:
+        model (torch.nn.Module): The model whose named parameters are iterated.
+        optimizer (LowLevelZeroOptimizer): Provides the working-to-master parameter map
+            and the process group of each parameter.
+        pinned_state_dict (dict): Maps parameter name to the destination tensor that
+            receives the gathered weight. It is only accessed on global rank 0.
+    """
+    w2m = optimizer.get_working_to_master_map()
+    for name, param in model.named_parameters():
+        # the map is keyed by the id() of the working parameter
+        master_p = w2m[id(param)]
+        zero_pg = optimizer.param_to_pg[param]
+        world_size = dist.get_world_size(zero_pg)
+        # all_gather is a collective: every rank of the group allocates receive buffers
+        # NOTE(review): assumes each rank's master_p has the same size and dtype - confirm
+        all_params = [torch.empty_like(master_p) for _ in range(world_size)]
+        dist.all_gather(all_params, master_p, group=zero_pg)
+        if dist.get_rank() == 0:
+            all_params = torch.cat(all_params)
+            # keep only the first param.numel() entries, then restore the parameter's shape
+            gathered_param = remove_padding(all_params, param.shape).view(param.shape)
+            pinned_state_dict[name].copy_(gathered_param)
+
+    dist.barrier()
+
+
+def load_master_weights(model: torch.nn.Module, optimizer: LowLevelZeroOptimizer, state_dict: dict) -> None:
+    """
+    Copy this rank's slice of each full weight in ``state_dict`` into the optimizer's master parameter.
+
+    Args:
+        model (torch.nn.Module): The model whose named parameters are iterated.
+        optimizer (LowLevelZeroOptimizer): Provides the working-to-master parameter map.
+        state_dict (dict): Maps parameter name to the full (unsharded) weight tensor.
+    """
+    pg = get_data_parallel_group(get_mixed_dp_pg=True)
+    world_size = dist.get_world_size(pg)
+    rank = dist.get_rank(pg)
+    w2m = optimizer.get_working_to_master_map()
+    for name, param in model.named_parameters():
+        master_p = w2m[id(param)]
+        # flatten the full weight so it can be split evenly across ranks
+        state = state_dict[name].view(-1)
+        # pad at the end up to len(master_p) * world_size elements
+        # NOTE(review): assumes master_p is this rank's flat shard and that shards have
+        # equal length on every rank - confirm against the optimizer
+        padding_size = len(master_p) * world_size - len(state)
+        state = torch.nn.functional.pad(state, [0, padding_size])
+        # pick the chunk belonging to this rank and match the master dtype
+        target_chunk = state.chunk(world_size)[rank].to(master_p.dtype)
+        master_p[: len(target_chunk)].copy_(target_chunk)
+
+
+class CheckpointIO:
+    """
+    Save and load training checkpoints (model, EMA, optimizer, scheduler, sampler, running states).
+
+    The instance keeps the asynchronous writers and the pinned CPU buffers between
+    calls so the buffers are allocated only once and pending writes can be
+    synchronized before the next save.
+    """
+
+    def __init__(self, n_write_entries: int = 32):
+        """
+        Args:
+            n_write_entries (int): Stored on the instance; not used by the methods of this class.
+        """
+        self.n_write_entries = n_write_entries
+        # writer and pinned buffers for the EMA weights
+        self.writer: Optional[AsyncFileWriter] = None
+        self.pinned_state_dict: Optional[Dict[str, torch.Tensor]] = None
+        # writer and pinned buffers for the optimizer master weights
+        self.master_pinned_state_dict: Optional[Dict[str, torch.Tensor]] = None
+        self.master_writer: Optional[AsyncFileWriter] = None
+
+    def _sync_io(self):
+        """Synchronize and drop any outstanding EMA / master-weight writers."""
+        # presumably synchronize() blocks until the pending write has finished - confirm
+        if self.writer is not None:
+            self.writer.synchronize()
+            self.writer = None
+        if self.master_writer is not None:
+            self.master_writer.synchronize()
+            self.master_writer = None
+
+    def __del__(self):
+        # make sure pending writers are synchronized before the object goes away
+        self._sync_io()
+
+    def _prepare_pinned_state_dict(self, ema: nn.Module, ema_shape_dict: dict):
+        """Allocate the pinned EMA buffers once; only global rank 0 allocates them."""
+        if self.pinned_state_dict is None and dist.get_rank() == 0:
+            self.pinned_state_dict = _prepare_ema_pinned_state_dict(ema, ema_shape_dict)
+
+    def _prepare_master_pinned_state_dict(self, model: nn.Module, optimizer: LowLevelZeroOptimizer):
+        """Allocate the pinned master-weight buffers once; only global rank 0 allocates them."""
+        if self.master_pinned_state_dict is None and dist.get_rank() == 0:
+            sd = {}
+            w2m = optimizer.get_working_to_master_map()
+            for n, p in model.named_parameters():
+                master_p = w2m[id(p)]
+                # shape of the working parameter, dtype of the master copy
+                sd[n] = torch.empty(p.shape, dtype=master_p.dtype, pin_memory=True, device="cpu")
+            self.master_pinned_state_dict = sd
+
+    def save(
+        self,
+        booster: Booster,
+        save_dir: str,
+        model: nn.Module = None,
+        ema: nn.Module = None,
+        optimizer: Optimizer = None,
+        lr_scheduler: _LRScheduler = None,
+        sampler=None,
+        epoch: int = None,
+        step: int = None,
+        global_step: int = None,
+        batch_size: int = None,
+        lora: bool = False,
+        actual_update_step: int = None,
+        ema_shape_dict: dict = None,
+        async_io: bool = True,
+        include_master_weights: bool = False,
+    ) -> str:
+        """
+        Save a checkpoint into ``<save_dir>/epoch{epoch}-global_step{actual_update_step}``.
+
+        Args:
+            booster (Booster): The Booster object.
+            save_dir (str): The parent directory to save the checkpoint under.
+            model (nn.Module): The model to save the checkpoint from.
+            ema (nn.Module): The EMA model to save the checkpoint from.
+            optimizer (Optimizer): The optimizer to save the checkpoint from.
+            lr_scheduler (_LRScheduler): The learning rate scheduler to save the checkpoint from.
+            sampler: The sampler to save the checkpoint from.
+            epoch (int): The epoch of the checkpoint.
+            step (int): The step of the checkpoint.
+            global_step (int): The global step of the checkpoint.
+            batch_size (int): The batch size of the checkpoint.
+            lora (bool): Whether the model is trained with LoRA.
+            actual_update_step (int): Recorded in the running states and used in the
+                checkpoint directory name.
+            ema_shape_dict (dict): Maps EMA parameter names to their shapes; used to
+                size the pinned buffers and passed to ``model_gathering``.
+            async_io (bool): Passed as ``use_async`` to the booster for model/optimizer and
+                selects between ``ema.safetensors`` (async) and ``ema.pt`` (``torch.save``).
+            include_master_weights (bool): Also gather the optimizer's master weights and
+                write them to ``master.safetensors``. Requires ``model`` and ``optimizer``.
+
+        Returns:
+            str: The path to the saved checkpoint
+        """
+        # finish writes from a previous save before the pinned buffers are reused
+        self._sync_io()
+        save_dir = os.path.join(save_dir, f"epoch{epoch}-global_step{actual_update_step}")
+        os.environ["TENSORNVME_DEBUG_LOG"] = os.path.join(save_dir, "async_file_io.log")
+        if model is not None:
+            if not lora:
+                os.makedirs(os.path.join(save_dir, "model"), exist_ok=True)
+                booster.save_model(
+                    model,
+                    os.path.join(save_dir, "model"),
+                    shard=True,
+                    use_safetensors=True,
+                    size_per_shard=4096,
+                    use_async=async_io,
+                )
+            else:
+                os.makedirs(os.path.join(save_dir, "lora"), exist_ok=True)
+                booster.save_lora_as_pretrained(model, os.path.join(save_dir, "lora"))
+        if optimizer is not None:
+            booster.save_optimizer(
+                optimizer, os.path.join(save_dir, "optimizer"), shard=True, size_per_shard=4096, use_async=async_io
+            )
+            if include_master_weights:
+                # gathering runs on every rank; only rank 0 owns the destination buffers
+                self._prepare_master_pinned_state_dict(model, optimizer)
+                master_weights_gathering(model, optimizer, self.master_pinned_state_dict)
+        if lr_scheduler is not None:
+            booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
+        if ema is not None:
+            self._prepare_pinned_state_dict(ema, ema_shape_dict)
+            model_gathering(ema, ema_shape_dict, self.pinned_state_dict)
+        if dist.get_rank() == 0:
+            # NOTE(review): save_dir itself is only created as a side effect of the
+            # model/lora/optimizer branches above - confirm it exists when those are skipped
+            running_states = {
+                "epoch": epoch,
+                "step": step,
+                "global_step": global_step,
+                "batch_size": batch_size,
+                "actual_update_step": actual_update_step,
+            }
+            save_json(running_states, os.path.join(save_dir, "running_states.json"))
+
+            if ema is not None:
+                if async_io:
+                    self.writer = async_save(os.path.join(save_dir, "ema.safetensors"), self.pinned_state_dict)
+                else:
+                    # the synchronous path saves ema.state_dict() directly, not the gathered pinned copy
+                    torch.save(ema.state_dict(), os.path.join(save_dir, "ema.pt"))
+
+            if sampler is not None:
+                # only for VariableVideoBatchSampler
+                torch.save(sampler.state_dict(step), os.path.join(save_dir, "sampler"))
+
+            if optimizer is not None and include_master_weights:
+                # master weights always go through async_save, independent of async_io
+                self.master_writer = async_save(
+                    os.path.join(save_dir, "master.safetensors"), self.master_pinned_state_dict
+                )
+
+        dist.barrier()
+        return save_dir
+
+    def load(
+        self,
+        booster: Booster,
+        load_dir: str,
+        model: nn.Module = None,
+        ema: nn.Module = None,
+        optimizer: Optimizer = None,
+        lr_scheduler: _LRScheduler = None,
+        sampler=None,
+        strict: bool = False,
+        include_master_weights: bool = False,
+    ) -> tuple[int, int]:
+        """
+        Load a checkpoint.
+
+        Args:
+            booster (Booster): The Booster object.
+            load_dir (str): The directory to load the checkpoint from.
+            model (nn.Module): The model to load the checkpoint into.
+            ema (nn.Module): The EMA model to load the checkpoint into.
+            optimizer (Optimizer): The optimizer to load the checkpoint into.
+            lr_scheduler (_LRScheduler): The learning rate scheduler to load the checkpoint into.
+            sampler: The sampler to load the checkpoint into.
+            strict (bool): Forwarded to ``booster.load_model`` and ``ema.load_state_dict``.
+            include_master_weights (bool): Also read ``master.safetensors`` and load it into
+                the optimizer's master parameters. Requires ``model`` and ``optimizer``.
+
+        Returns:
+            tuple[int, int]: The epoch and step of the checkpoint.
+        """
+        assert os.path.exists(load_dir), f"Checkpoint directory {load_dir} does not exist"
+        assert os.path.exists(os.path.join(load_dir, "running_states.json")), "running_states.json does not exist"
+
+        running_states = load_json(os.path.join(load_dir, "running_states.json"))
+        if model is not None:
+            booster.load_model(
+                model,
+                # accepts a "model" directory as well as "model.safetensors" / "model.pt"
+                _search_valid_path(os.path.join(load_dir, "model")),
+                strict=strict,
+                low_cpu_mem_mode=False,
+                num_threads=32,
+            )
+        if ema is not None:
+            # prefer the safetensors file (async save path), fall back to the torch.save file
+            if os.path.exists(os.path.join(load_dir, "ema.safetensors")):
+                ema_state_dict = load_file(os.path.join(load_dir, "ema.safetensors"))
+            else:
+                ema_state_dict = torch.load(os.path.join(load_dir, "ema.pt"), map_location=torch.device("cpu"))
+            # ema is not boosted, so we don't use booster.load_model
+            ema.load_state_dict(ema_state_dict, strict=strict, assign=True)
+
+        if optimizer is not None:
+            booster.load_optimizer(
+                optimizer, os.path.join(load_dir, "optimizer"), low_cpu_mem_mode=False, num_threads=32
+            )
+            if include_master_weights:
+                master_state_dict = load_file(os.path.join(load_dir, "master.safetensors"))
+                load_master_weights(model, optimizer, master_state_dict)
+        if lr_scheduler is not None:
+            booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, "lr_scheduler"))
+        if sampler is not None:
+            sampler.load_state_dict(torch.load(os.path.join(load_dir, "sampler")))
+
+        dist.barrier()
+
+        return (running_states["epoch"], running_states["step"])
diff --git a/opensora/utils/config.py b/opensora/utils/config.py
new file mode 100644
index 0000000..770b55a
--- /dev/null
+++ b/opensora/utils/config.py
@@ -0,0 +1,213 @@
+import argparse
+import ast
+import json
+import os
+from datetime import datetime
+
+import torch
+from mmengine.config import Config
+
+from .logger import is_distributed, is_main_process
+
+
+def parse_args() -> tuple[str, list[str]]:
+    """
+    Parse the command line: one positional config path plus free-form overrides.
+
+    Returns:
+        tuple[str, list[str]]: The path to the configuration file and the remaining
+            command line tokens that the parser did not recognize, in their original
+            order (for example ``["--lr", "1e-4"]``).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("config", type=str, help="model config file path")
+    # parse_known_args returns unrecognized tokens as a plain list of strings
+    known, overrides = parser.parse_known_args()
+    return known.config, overrides
+
+
+def read_config(config_path: str) -> Config:
+    """
+    Load a configuration file through ``Config.fromfile``.
+
+    Args:
+        config_path (str): Location of the configuration file.
+
+    Returns:
+        Config: The loaded configuration object.
+    """
+    return Config.fromfile(config_path)
+
+
+def parse_configs() -> Config:
+    """
+    Build the run configuration from the config file named on the command line.
+
+    The remaining command line tokens are merged into the loaded config as
+    overrides, and the config path is recorded on the result as ``config_path``.
+
+    Returns:
+        Config: The merged configuration object.
+    """
+    config_path, overrides = parse_args()
+    cfg = merge_args(read_config(config_path), overrides)
+    cfg.config_path = config_path
+
+    # hard-coded for spatial compression: exported through the environment
+    has_spatial_compression = cfg.get("ae_spatial_compression", None) is not None
+    if has_spatial_compression:
+        os.environ["AE_SPATIAL_COMPRESSION"] = str(cfg.ae_spatial_compression)
+    return cfg
+
+
+def merge_args(cfg: Config, args: list[str]) -> Config:
+    """
+    Apply ``--key value`` command line overrides to a configuration, in place.
+
+    ``args`` is read as consecutive (flag, value) pairs; a trailing unpaired token
+    is ignored. Dashes in a flag are turned into underscores and dots address
+    nested entries (``--model.hidden-size 8`` sets ``cfg["model"]["hidden_size"]``).
+
+    Value conversion:
+        * ``none`` (any case) becomes ``None``.
+        * If the key already holds an ``int``, ``float`` or ``str``, the value is
+          cast to that same type.
+        * Otherwise (new key, or an existing bool / None / container value) the
+          value is parsed with ``auto_convert``.
+
+    Args:
+        cfg (Config): The configuration object to update.
+        args (list[str]): Unparsed command line tokens, as returned by ``parse_args``.
+
+    Returns:
+        Config: The same configuration object, after the overrides were applied.
+    """
+    for flag, raw in zip(args[::2], args[1::2]):
+        assert flag.startswith("--"), f"Invalid argument: {flag}"
+        *parents, leaf = flag[2:].replace("-", "_").split(".")
+        target = cfg
+        for key in parents:
+            # each level must exist in the node being descended into, not in the root
+            assert key in target, f"Key {key} not found in config"
+            target = target[key]
+        if raw.lower() == "none":
+            value = None
+        elif leaf in target and type(target[leaf]) in (int, float, str):
+            # keep the type of the value being replaced (look it up by leaf name)
+            value = type(target[leaf])(raw)
+        else:
+            # bool / None / list / tuple / dict cannot be rebuilt by calling their
+            # constructor on the raw string, so parse the literal instead
+            value = auto_convert(raw)
+        target[leaf] = value
+    return cfg
+
+
+def auto_convert(value: str) -> int | float | bool | list | dict | None:
+    """
+    Guess the Python value that a command line string stands for.
+
+    Conversion is attempted in this order: empty string (returned as is),
+    ``none`` / ``true`` / ``false`` (case-insensitive), ``int``, ``float``, then
+    any Python literal accepted by ``ast.literal_eval`` (list, dict, tuple, ...).
+
+    Args:
+        value (str): The string to convert.
+
+    Returns:
+        The converted value, or the original string when nothing matches.
+    """
+    if value == "":
+        return value
+
+    # keyword literals, matched case-insensitively
+    keywords = {"none": None, "true": True, "false": False}
+    lowered = value.lower()
+    if lowered in keywords:
+        return keywords[lowered]
+
+    # parsers are tried from the most to the least specific
+    parsers = (
+        (int, (ValueError,)),
+        (float, (ValueError,)),
+        (ast.literal_eval, (ValueError, SyntaxError)),
+    )
+    for parse, expected_errors in parsers:
+        try:
+            return parse(value)
+        except expected_errors:
+            continue
+
+    # nothing matched: keep the raw string
+    return value
+
+
+def sync_string(value: str):
+    """
+    Return rank 0's copy of ``value`` on every process.
+
+    Outside a distributed run the input is returned unchanged. Otherwise the
+    UTF-8 bytes are placed in a zero-filled 256-byte CUDA buffer, broadcast from
+    rank 0, and decoded again with the zero padding stripped.
+    """
+    if not is_distributed():
+        return value
+    payload = value.encode("utf-8")
+    buffer = torch.zeros(256, dtype=torch.uint8).cuda()
+    buffer[: len(payload)] = torch.tensor(list(payload), dtype=torch.uint8)
+    torch.distributed.broadcast(buffer, 0)
+    received = buffer.cpu().numpy().tobytes()
+    return received.decode("utf-8").rstrip("\x00")
+
+
+def create_experiment_workspace(
+    output_dir: str, model_name: str = None, config: dict = None, exp_name: str = None
+) -> tuple[str, str]:
+    """
+    Create the folder of one experiment and store its config there.
+
+    Args:
+        output_dir: The directory that holds all experiment subfolders.
+        model_name: Optional model name appended to the generated experiment name
+            (``/`` is replaced by ``-``).
+        config: The configuration dumped as JSON into ``config.txt``.
+        exp_name: The experiment name; when None a timestamp-based name is
+            generated and synchronized across processes.
+
+    Returns:
+        tuple[str, str]: The experiment name and the experiment directory.
+    """
+    if exp_name is None:
+        timestamp = sync_string(datetime.now().strftime("%y%m%d_%H%M%S"))
+        suffix = "" if model_name is None else "-" + model_name.replace("/", "-")
+        exp_name = f"{timestamp}{suffix}"
+    exp_dir = f"{output_dir}/{exp_name}"
+
+    # only the main process touches the file system
+    if is_main_process():
+        os.makedirs(exp_dir, exist_ok=True)
+        with open(f"{exp_dir}/config.txt", "w", encoding="utf-8") as config_file:
+            json.dump(config, config_file, indent=4)
+
+    return exp_name, exp_dir
+
+
+def config_to_name(cfg: Config) -> str:
+    """
+    Derive a flat name from the config's file name.
+
+    Every ``configs/`` and ``.py`` occurrence is removed and the remaining
+    ``/`` separators become ``_``.
+    """
+    name = cfg._filename
+    for old, new in (("configs/", ""), (".py", ""), ("/", "_")):
+        name = name.replace(old, new)
+    return name
+
+
+def parse_alias(cfg: Config) -> Config:
+    """
+    Copy top-level shortcut keys of the config into their nested locations.
+
+    ``resolution``, ``guidance``, ``guidance_img``, ``num_steps``, ``num_frames``
+    and ``aspect_ratio`` are written to ``cfg.sampling_option`` (with float/int
+    casts where listed below); ``ckpt_path`` is written to
+    ``cfg.model.from_pretrained``. Keys that are absent or None are skipped.
+    """
+
+    def _keep(raw):
+        return raw
+
+    sampling_aliases = (
+        ("resolution", _keep),
+        ("guidance", float),
+        ("guidance_img", float),
+        ("num_steps", int),
+        ("num_frames", int),
+        ("aspect_ratio", _keep),
+    )
+    for key, cast in sampling_aliases:
+        if cfg.get(key, None) is None:
+            continue
+        converted = cast(getattr(cfg, key))
+        setattr(cfg.sampling_option, key, converted)
+
+    if cfg.get("ckpt_path", None) is not None:
+        cfg.model.from_pretrained = cfg.ckpt_path
+    return cfg
diff --git a/opensora/utils/inference.py b/opensora/utils/inference.py
new file mode 100644
index 0000000..788ddca
--- /dev/null
+++ b/opensora/utils/inference.py
@@ -0,0 +1,351 @@
+import copy
+import os
+import re
+from enum import Enum
+
+import torch
+from torch import nn
+
+from opensora.datasets import save_sample
+from opensora.datasets.aspect import get_image_size
+from opensora.datasets.utils import read_from_path, rescale_image_by_path
+from opensora.utils.logger import log_message
+from opensora.utils.prompt_refine import refine_prompts
+
+
+class SamplingMethod(Enum):
+    """Identifiers of the available sampling methods."""
+
+    I2V = "i2v"  # for open sora video generation
+    DISTILLED = "distill"  # for flux image generation
+
+
+def create_tmp_csv(save_dir: str, prompt: str, ref: str = None, create=True) -> str:
+    """
+    Create a temporary CSV file with the prompt text.
+
+    The file has a ``text`` column and, when ``ref`` is given, a ``ref`` column,
+    followed by a single data row. Both fields are written quoted.
+
+    Args:
+        save_dir (str): The directory where the CSV file will be saved.
+        prompt (str): The prompt text.
+        ref (str): Optional reference path(s) stored in the ``ref`` column.
+        create (bool): When False, nothing is written and only the path is returned.
+
+    Returns:
+        str: The path to the temporary CSV file (``<save_dir>/prompt.csv``).
+    """
+    tmp_file = os.path.join(save_dir, "prompt.csv")
+    if not create:
+        return tmp_file
+
+    def _quote(field) -> str:
+        # inside a quoted CSV field a double quote is escaped by doubling it;
+        # without this a prompt such as: a "red" car  would break the row
+        return '"' + str(field).replace('"', '""') + '"'
+
+    if ref is not None:
+        header, row = "text,ref", (prompt, ref)
+    else:
+        header, row = "text", (prompt,)
+    with open(tmp_file, "w", encoding="utf-8") as f:
+        f.write(header + "\n" + ",".join(_quote(field) for field in row))
+    return tmp_file
+
+
+def modify_option_to_t2i(sampling_option, distilled: bool = False, img_resolution: str = "1080px"):
+    """
+    Derive a text-to-image sampling option from a video sampling option.
+
+    Works on a shallow copy: a single frame, the image size for
+    ``img_resolution`` at the original aspect ratio, guidance 4.0, and the
+    original resolution remembered as ``resized_resolution``. With
+    ``distilled=True`` the method is switched to ``SamplingMethod.DISTILLED``.
+    """
+    t2i_option = copy.copy(sampling_option)
+    if distilled:
+        t2i_option.method = SamplingMethod.DISTILLED
+    t2i_option.num_frames = 1
+    height, width = get_image_size(img_resolution, sampling_option.aspect_ratio)
+    t2i_option.height = height
+    t2i_option.width = width
+    t2i_option.guidance = 4.0
+    t2i_option.resized_resolution = sampling_option.resolution
+
+    return t2i_option
+
+
+def get_save_path_name(
+    save_dir,
+    sub_dir,
+    save_prefix="",
+    name=None,
+    fallback_name=None,
+    index=None,
+    num_sample_pos=None,  # idx for prompt as path
+    prompt_as_path=False,  # save sample with same name as prompt
+    prompt=None,
+):
+    """
+    Get the save path (without file extension) for a generated sample.
+
+    With ``prompt_as_path`` the file name is ``<prompt without surrounding dots>-<num_sample_pos>``.
+    Otherwise it is ``<save_prefix><name>`` or, when ``name`` is None,
+    ``<save_prefix><fallback_name>_<index as 4 digits>``; a positive
+    ``num_sample_pos`` is appended as ``_<num_sample_pos>``.
+
+    Returns:
+        str: ``save_dir/sub_dir/<file name>``.
+    """
+    if prompt_as_path:  # for vbench
+        cleaned_prompt = prompt.strip(".")
+        fname = f"{cleaned_prompt}-{num_sample_pos}"
+    else:
+        if name is not None:
+            fname = save_prefix + name
+        else:
+            fname = f"{save_prefix + fallback_name}_{index:04d}"
+        # num_sample_pos defaults to None; treat that like "no suffix" instead of
+        # failing on the None > 0 comparison
+        if num_sample_pos is not None and num_sample_pos > 0:
+            fname += f"_{num_sample_pos}"
+
+    return os.path.join(save_dir, sub_dir, fname)
+
+
+def get_names_from_path(path):
+    """
+    Get the file name of a path without its directory and last extension.
+
+    Args:
+        path (str): The path to the file.
+
+    Returns:
+        str: The base name with the final extension removed.
+    """
+    stem, _extension = os.path.splitext(os.path.basename(path))
+    return stem
+
+
+def process_and_save(
+    x: torch.Tensor,
+    batch: dict,
+    cfg: dict,
+    sub_dir: str,
+    generate_sampling_option,
+    epoch: int,
+    start_index: int,
+    saving: bool = True,
+):
+    """
+    Name every generated sample of a batch and, if requested, write it to disk.
+
+    For each sample a save path is derived from the batch entry (name / index /
+    prompt). With ``saving=True`` the prompt is written to ``<path>.txt``, the
+    sample is stored through ``save_sample`` and, for single-frame t2i2v outputs
+    whose resolution differs from ``resized_resolution``, the saved PNG is rescaled.
+
+    Returns:
+        list[str]: The file names (no directory, no extension) of all samples.
+    """
+    data_file = cfg.dataset.data_path.split("/")[-1]
+    fallback_name = data_file.split(".")[0]
+    use_prompt_as_path = cfg.get("prompt_as_path", False)
+    fps = cfg.get("fps_save", 16)
+    out_dir = cfg.save_dir
+
+    # entries missing from the batch are represented by None placeholders
+    placeholders = [None] * len(x)
+    names = batch["name"] if "name" in batch else placeholders
+    if "index" in batch:
+        indices = [idx + start_index for idx in batch["index"]]
+    else:
+        indices = placeholders
+    prompts = batch["text"]
+
+    saved_names = []
+    is_image = generate_sampling_option.num_frames == 1
+    for sample, name, index, prompt in zip(x, names, indices, prompts):
+        # == get save path ==
+        save_path = get_save_path_name(
+            out_dir,
+            sub_dir,
+            save_prefix=cfg.get("save_prefix", ""),
+            name=name,
+            fallback_name=fallback_name,
+            index=index,
+            num_sample_pos=epoch,
+            prompt_as_path=use_prompt_as_path,
+            prompt=prompt,
+        )
+        saved_names.append(get_names_from_path(save_path))
+
+        if not saving:
+            continue
+
+        # == write txt to disk ==
+        with open(save_path + ".txt", "w", encoding="utf-8") as prompt_file:
+            prompt_file.write(prompt)
+
+        # == save samples ==
+        save_sample(sample, save_path=save_path, fps=fps)
+
+        # == resize image for t2i2v ==
+        if (
+            cfg.get("use_t2i2v", False)
+            and is_image
+            and generate_sampling_option.resolution != generate_sampling_option.resized_resolution
+        ):
+            log_message("Rescaling image to %s...", generate_sampling_option.resized_resolution)
+            height, width = get_image_size(
+                generate_sampling_option.resized_resolution, generate_sampling_option.aspect_ratio
+            )
+            rescale_image_by_path(save_path + ".png", width, height)
+
+    return saved_names
+
+
+def check_fps_added(sentence):
+    """
+    Tell whether the sentence already ends with an FPS note such as ``16 FPS.``.
+    """
+    return re.search(r"\d+ FPS\.$", sentence) is not None
+
+
+def ensure_sentence_ends_with_period(sentence: str):
+    """
+    Strip surrounding whitespace and append a period unless one is already there.
+    """
+    stripped = sentence.strip()
+    return stripped if stripped.endswith(".") else stripped + "."
+
+
+def add_fps_info_to_text(text: list[str], fps: int = 16):
+    """
+    Return the prompts with a trailing ``<fps> FPS.`` note.
+
+    Each prompt is first normalized to end with a period; prompts that already
+    end with an FPS note are left without a second one.
+    """
+
+    def _with_fps(sentence: str) -> str:
+        sentence = ensure_sentence_ends_with_period(sentence)
+        return sentence if check_fps_added(sentence) else sentence + f" {fps} FPS."
+
+    return [_with_fps(item) for item in text]
+
+
+def add_motion_score_to_text(text, motion_score: int | str):
+    """
+    Append a motion score sentence to every prompt.
+
+    A fixed ``motion_score`` is appended as ``<score> motion score.``; the special
+    value ``"dynamic"`` obtains one suffix per prompt from ``refine_prompts``.
+    """
+    if motion_score != "dynamic":
+        return [f"{t} {motion_score} motion score." for t in text]
+
+    scores = refine_prompts(text, type="motion_score")
+    return [f"{t} {scores[i]}." for i, t in enumerate(text)]
+
+
+def add_noise_to_ref(masked_ref: torch.Tensor, masks: torch.Tensor, t: float, sigma_min: float = 1e-5):
+    """
+    Blend the reference with Gaussian noise at level ``t`` and re-apply the mask.
+
+    Computes ``masks * ((1 - (1 - sigma_min) * t) * masked_ref + t * noise)`` with
+    ``noise`` drawn from a standard normal of the same shape as ``masked_ref``.
+    """
+    noise = torch.randn_like(masked_ref)
+    keep_ratio = 1 - (1 - sigma_min) * t
+    noisy_ref = keep_ratio * masked_ref + t * noise
+    return masks * noisy_ref
+
+
+def collect_references_batch(
+    reference_paths: list[str],
+    cond_type: str,
+    model_ae: nn.Module,
+    image_size: tuple[int, int],
+    is_causal=False,
+):
+    """
+    Read and encode the reference media of every batch item.
+
+    Each entry of ``reference_paths`` holds one or more paths joined by ``;``; an
+    empty string means "no reference" and yields None. Depending on
+    ``cond_type`` the first/last frame (i2v) or a leading/trailing clip (v2v) is
+    cut out and encoded with ``model_ae.encode`` on the autoencoder's device/dtype.
+
+    Returns:
+        list: One entry per batch item - None or a list of encoded references.
+    """
+    first_param = next(model_ae.parameters())
+    device, dtype = first_param.device, first_param.dtype
+
+    def _load(path):
+        # size [C, T, H, W]
+        return read_from_path(path, image_size, transform_name="resize_crop")
+
+    def _encode(frames):
+        # add a batch dimension for the encoder and drop it again; size [C, T, H, W]
+        return model_ae.encode(frames.unsqueeze(0).to(device, dtype)).squeeze(0)
+
+    refs_x = []  # refs_x: [batch, ref_num, C, T, H, W]
+    for reference_path in reference_paths:
+        if reference_path == "":
+            refs_x.append(None)
+            continue
+        paths = reference_path.split(";")
+
+        if "v2v" in cond_type:
+            clip = _load(paths[0])
+            actual_t = clip.size(1)
+            # if reference not long enough, default to shorter ref
+            target_t = 64 if (actual_t >= 64 and "easy" in cond_type) else 32
+            if is_causal:
+                target_t += 1
+            assert actual_t >= target_t, f"need at least {target_t} reference frames for v2v generation"
+            if "head" in cond_type:  # v2v head
+                clip = clip[:, :target_t]
+            elif "tail" in cond_type:  # v2v tail
+                clip = clip[:, -target_t:]
+            else:
+                raise NotImplementedError
+            ref = [_encode(clip)]
+        elif cond_type == "i2v_head":  # take the 1st frame from first ref_path
+            ref = [_encode(_load(paths[0])[:, :1])]
+        elif cond_type == "i2v_tail":  # take the last frame from last ref_path
+            ref = [_encode(_load(paths[-1])[:, -1:])]
+        elif cond_type == "i2v_loop":
+            # first frame of the first path, then last frame of the last path
+            head = _encode(_load(paths[0])[:, :1])
+            tail = _encode(_load(paths[-1])[:, -1:])
+            ref = [head, tail]
+        else:
+            raise NotImplementedError(f"Unknown condition type {cond_type}")
+
+        refs_x.append(ref)
+    return refs_x
+
+
+def prepare_inference_condition(
+    z: torch.Tensor,
+    mask_cond: str,
+    ref_list: list[list[torch.Tensor]] = None,
+    causal: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Prepare the visual condition for the model, using causal vae.
+
+    Args:
+        z (torch.Tensor): The latent noise tensor, of shape [B, C, T, H, W]
+        mask_cond (str): The condition type: "t2v", "i2v_head", "i2v_tail", "i2v_loop",
+            "v2v_head", "v2v_tail", "v2v_head_easy" or "v2v_tail_easy".
+        ref_list: list of lists of media (image/video) for i2v and v2v condition, of shape [C, T', H, W]; len(ref_list)==B; ref_list[i] is the list of media for the generation in batch idx i, we use a list of media for each batch item so that it can have multiple references. For example, ref_list[i] could be [ref_image_1, ref_image_2] for i2v_loop condition. May be None for "t2v"; a None entry disables conditioning for that item.
+        causal (bool): Adds one extra conditioned frame to the v2v windows.
+
+    Returns:
+        tuple[torch.Tensor, torch.Tensor]: ``masks`` of shape [B, 1, T, H, W] (1 on
+            conditioned frames) and ``masked_z`` of shape [B, C, T, H, W] (reference
+            latents on conditioned frames, 0 elsewhere), both on z's device and dtype.
+    """
+    # x has shape [b, c, t, h, w], where b is the batch size
+    B, C, T, H, W = z.shape
+
+    masks = torch.zeros(B, 1, T, H, W)
+    masked_z = torch.zeros(B, C, T, H, W)
+
+    if ref_list is None:
+        assert mask_cond == "t2v", f"reference is required for {mask_cond}"
+        # no references at all: treat every batch item as unconditioned instead
+        # of failing on ref_list[i] below
+        ref_list = [None] * B
+
+    # number of conditioned frames of each v2v variant, before the causal extra frame
+    v2v_frames = {"v2v_head": 8, "v2v_tail": 8, "v2v_head_easy": 16, "v2v_tail_easy": 16}
+
+    for i in range(B):
+        ref = ref_list[i]
+
+        # warning message
+        if ref is None and mask_cond != "t2v":
+            print("no reference found. will default to cond_type t2v!")
+
+        if ref is None or T <= 1:  # nothing to condition on, or a single image
+            continue
+
+        # Apply the selected mask condition directly on the masks tensor
+        if mask_cond == "i2v_head":  # equivalent to masking the first timestep
+            masks[i, :, 0, :, :] = 1
+            masked_z[i, :, 0, :, :] = ref[0][:, 0, :, :]
+        elif mask_cond == "i2v_tail":  # mask the last timestep
+            masks[i, :, -1, :, :] = 1
+            masked_z[i, :, -1, :, :] = ref[-1][:, -1, :, :]
+        elif mask_cond in v2v_frames:
+            k = v2v_frames[mask_cond] + int(causal)
+            # head variants condition on the first k frames, tail variants on the last k
+            window = slice(None, k) if "head" in mask_cond else slice(-k, None)
+            masks[i, :, window, :, :] = 1
+            masked_z[i, :, window, :, :] = ref[0][:, window, :, :]
+        elif mask_cond == "i2v_loop":  # mask first and last timesteps
+            masks[i, :, 0, :, :] = 1
+            masks[i, :, -1, :, :] = 1
+            masked_z[i, :, 0, :, :] = ref[0][:, 0, :, :]
+            masked_z[i, :, -1, :, :] = ref[-1][:, -1, :, :]  # last frame of last referenced content
+        else:
+            # "t2v" is the fallback case where no specific condition is specified
+            assert mask_cond == "t2v", f"Unknown mask condition {mask_cond}"
+
+    masks = masks.to(z.device, z.dtype)
+    masked_z = masked_z.to(z.device, z.dtype)
+    return masks, masked_z
diff --git a/opensora/utils/logger.py b/opensora/utils/logger.py
new file mode 100644
index 0000000..06c4a45
--- /dev/null
+++ b/opensora/utils/logger.py
@@ -0,0 +1,90 @@
+import logging
+import os
+
+import torch.distributed as dist
+
+
+def is_distributed() -> bool:
+    """
+    Tell whether the process was launched in a distributed setting.
+
+    Returns:
+        bool: True when the ``WORLD_SIZE`` environment variable is set, False otherwise.
+    """
+    return "WORLD_SIZE" in os.environ
+
+
+def is_main_process() -> bool:
+    """
+    Tell whether this process is the main one.
+
+    Returns:
+        bool: True outside a distributed run or on rank 0, False on every other rank.
+    """
+    if not is_distributed():
+        return True
+    return dist.get_rank() == 0
+
+
+def get_world_size() -> int:
+    """
+    Return how many processes take part in the run.
+
+    Returns:
+        int: The distributed world size, or 1 outside a distributed run.
+    """
+    return dist.get_world_size() if is_distributed() else 1
+
+
+def create_logger(logging_dir: str = None) -> logging.Logger:
+    """
+    Set up logging for this process and return the module logger.
+
+    On the main process the root logger is configured at INFO level; with a
+    ``logging_dir`` the records go to both the stream handler and
+    ``<logging_dir>/log.txt``. Other processes only get a ``NullHandler``.
+
+    Args:
+        logging_dir (str): The directory to save the log file.
+
+    Returns:
+        logging.Logger: The logger.
+    """
+    logger = logging.getLogger(__name__)
+    if not is_main_process():
+        logger.addHandler(logging.NullHandler())
+        return logger
+
+    basic_config = {
+        "level": logging.INFO,
+        "format": "[\033[34m%(asctime)s\033[0m] %(message)s",
+        "datefmt": "%Y-%m-%d %H:%M:%S",
+    }
+    if logging_dir is not None:
+        basic_config["handlers"] = [
+            logging.StreamHandler(),
+            logging.FileHandler(f"{logging_dir}/log.txt"),
+        ]
+    logging.basicConfig(**basic_config)
+    if logging_dir is not None:
+        logger.info("Experiment directory created at %s", logging_dir)
+    return logger
+
+
+def log_message(*args, level: str = "info"):
+    """
+    Emit a message through the module logger or through ``print``.
+
+    Args:
+        *args: The message and its %-style arguments, forwarded unchanged.
+        level (str): One of "info", "warning", "error" or "print".
+
+    Raises:
+        ValueError: If ``level`` is none of the supported values.
+    """
+    logger = logging.getLogger(__name__)
+    if level == "print":
+        print(*args)
+        return
+    if level not in ("info", "warning", "error"):
+        raise ValueError(f"Invalid logging level: {level}")
+    # the logger method carries the same name as the level
+    getattr(logger, level)(*args)
diff --git a/opensora/utils/misc.py b/opensora/utils/misc.py
new file mode 100644
index 0000000..3b4bed6
--- /dev/null
+++ b/opensora/utils/misc.py
@@ -0,0 +1,438 @@
+import os
+import time
+from collections import OrderedDict
+from collections.abc import Sequence
+from contextlib import nullcontext
+
+import numpy as np
+import psutil
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from colossalai.cluster.dist_coordinator import DistCoordinator
+from torch.utils.tensorboard import SummaryWriter
+
+from opensora.acceleration.parallel_states import get_data_parallel_group
+
+from .logger import log_message
+
+
+def create_tensorboard_writer(exp_dir: str) -> SummaryWriter:
+    """
+    Open a tensorboard writer, creating its log directory if needed.
+
+    Args:
+        exp_dir (str): Experiment directory that will hold ``tensorboard/``.
+
+    Returns:
+        SummaryWriter: Writer bound to ``{exp_dir}/tensorboard``.
+    """
+    log_dir = f"{exp_dir}/tensorboard"
+    # exist_ok=True: do not fail when the directory is already there.
+    os.makedirs(log_dir, exist_ok=True)
+    return SummaryWriter(log_dir)
+
+
+# ======================================================
+# Memory
+# ======================================================
+
+GIGABYTE = 1024**3
+
+
+def log_cuda_memory(stage: str = None):
+    """
+    Log how much CUDA memory is currently allocated, in GB.
+
+    Args:
+        stage (str): Optional label; when given, " at <stage>" is inserted
+            into the message.
+    """
+    suffix = "" if stage is None else f" at {stage}"
+    allocated_gb = torch.cuda.memory_allocated() / GIGABYTE
+    log_message("CUDA memory usage" + suffix + ": %.1f GB", allocated_gb)
+
+
+def log_cuda_max_memory(stage: str = None):
+    """
+    Log the peak allocated and reserved CUDA memory, in GB.
+
+    Args:
+        stage (str): Optional label; left out of the messages when None (the default).
+    """
+    suffix = "" if stage is None else f" at {stage}"
+    torch.cuda.synchronize()
+    peak_allocated, peak_reserved = torch.cuda.max_memory_allocated(), torch.cuda.max_memory_reserved()
+    log_message("CUDA max memory max memory allocated" + suffix + ": %.1f GB", peak_allocated / GIGABYTE)
+    log_message("CUDA max memory max memory reserved" + suffix + ": %.1f GB", peak_reserved / GIGABYTE)
+
+
+# ======================================================
+# Number of parameters
+# ======================================================
+
+
+def get_model_numel(model: torch.nn.Module) -> tuple[int, int]:
+    """
+    Count the parameters of a model.
+
+    Args:
+        model (torch.nn.Module): The model whose parameters are counted.
+
+    Returns:
+        tuple[int, int]: (all parameters, parameters with ``requires_grad`` set).
+    """
+    total = 0
+    trainable = 0
+    for param in model.parameters():
+        count = param.numel()
+        total += count
+        trainable += count if param.requires_grad else 0
+    return total, trainable
+
+
+def log_model_params(model: nn.Module):
+    """
+    Log the total and trainable parameter counts of a model.
+
+    Args:
+        model (torch.nn.Module): The model; its class name prefixes each message.
+    """
+    total, trainable = get_model_numel(model)
+    tag = f"[{model.__class__.__name__}]"
+    log_message(f"{tag} Number of parameters: {format_numel_str(total)}")
+    log_message(f"{tag} Number of trainable parameters: {format_numel_str(trainable)}")
+
+
+# ======================================================
+# String
+# ======================================================
+
+
+def format_numel_str(numel: int) -> str:
+    """
+    Render an element count compactly with a B / M / K suffix.
+
+    The units are binary: K is 1024, M is 1024**2 and B is 1024**3.
+
+    Args:
+        numel (int): The number of elements.
+
+    Returns:
+        str: The count in the largest unit it reaches (two decimals), else the plain number.
+    """
+    units = (
+        ("B", 1024**3),
+        ("M", 1024**2),
+        ("K", 1024),
+    )
+    for suffix, size in units:
+        if numel >= size:
+            return f"{numel / size:.2f} {suffix}"
+    return f"{numel}"
+
+
+def format_duration(seconds: int) -> str:
+    """
+    Format a duration in seconds as a string such as "1d 2h 3m 4s".
+
+    Units whose value is zero are left out; an input of 0 yields "0s".
+    """
+    days, rest = divmod(seconds, 86400)
+    hours, rest = divmod(rest, 3600)
+    minutes, secs = divmod(rest, 60)
+
+    labelled = ((days, "d"), (hours, "h"), (minutes, "m"))
+    parts = [f"{value}{unit}" for value, unit in labelled if value > 0]
+    # Seconds are shown when non-zero, or when nothing else was shown (e.g. input 0).
+    if secs > 0 or not parts:
+        parts.append(f"{secs}s")
+    return " ".join(parts)
+
+
+# ======================================================
+# PyTorch
+# ======================================================
+
+
+def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:  # in-place mean over the data-parallel group
+    dist.all_reduce(tensor=tensor, group=get_data_parallel_group())  # default reduce op (sum)
+    tensor.div_(dist.get_world_size(group=get_data_parallel_group()))  # divide by the group size
+    return tensor
+
+
+def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:  # in-place reduction over the data-parallel group
+    dist.all_reduce(tensor=tensor, group=get_data_parallel_group())  # default reduce op (sum)
+    return tensor
+
+
+def to_tensor(data: torch.Tensor | np.ndarray | Sequence | int | float) -> torch.Tensor:
+    """Turn tensors, arrays, sequences and scalars into a :obj:`torch.Tensor`.
+
+    Tensors are returned as-is; a single int or float becomes a one-element
+    ``LongTensor`` / ``FloatTensor``.
+
+    Args:
+        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to be converted.
+
+    Returns:
+        torch.Tensor: The converted tensor.
+
+    Raises:
+        TypeError: If ``data`` is of any other type (including ``str``).
+    """
+    if isinstance(data, torch.Tensor):
+        return data
+    if isinstance(data, np.ndarray):
+        return torch.from_numpy(data)
+    if isinstance(data, Sequence) and not isinstance(data, str):
+        return torch.tensor(data)
+    if isinstance(data, int):
+        return torch.LongTensor([data])
+    if isinstance(data, float):
+        return torch.FloatTensor([data])
+    raise TypeError(f"type {type(data)} cannot be converted to tensor.")
+
+
+def to_ndarray(data: torch.Tensor | np.ndarray | Sequence | int | float) -> np.ndarray:
+    """Convert objects of various python types to :obj:`numpy.ndarray`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
+
+    Args:
+        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
+            be converted.
+
+    Returns:
+        numpy.ndarray: The converted ndarray. Scalars become 1-element arrays.
+    """
+    if isinstance(data, torch.Tensor):
+        # detach().cpu() keeps sharing memory for plain CPU tensors and also handles GPU / grad-tracking ones.
+        return data.detach().cpu().numpy()
+    elif isinstance(data, np.ndarray):
+        return data
+    elif isinstance(data, Sequence):
+        return np.array(data)
+    elif isinstance(data, int):
+        return np.array([data], dtype=int)  # not np.ndarray: that builds an uninitialized array of shape (data,)
+    elif isinstance(data, float):
+        return np.array([data], dtype=float)
+    raise TypeError(f"type {type(data)} cannot be converted to ndarray.")
+
+
+def to_torch_dtype(dtype: str | torch.dtype) -> torch.dtype:
+    """
+    Convert a string or a torch.dtype to a torch.dtype.
+
+    Args:
+        dtype (str | torch.dtype): A torch.dtype (returned unchanged) or one of the names mapped below.
+
+    Returns:
+        torch.dtype: The converted dtype.
+
+    Raises:
+        ValueError: If ``dtype`` is an unknown name or neither a str nor a torch.dtype.
+    """
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    dtype_mapping = {
+        "float64": torch.float64,
+        "float32": torch.float32,
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "fp32": torch.float32,
+        "fp16": torch.float16,
+        "half": torch.float16,
+        "bf16": torch.bfloat16,
+    }
+    if not isinstance(dtype, str) or dtype not in dtype_mapping:
+        raise ValueError(f"Unsupported dtype {dtype}")
+    return dtype_mapping[dtype]
+
+
+# ======================================================
+# Profile
+# ======================================================
+
+
+class Timer:
+    # Context manager that times its body; CUDA is synchronized (and ranks optionally barriered) at both ends.
+    def __init__(self, name, log=False, barrier=False, coordinator: DistCoordinator | None = None):
+        self.name = name
+        self.log = log
+        self.barrier = barrier
+        self.coordinator = coordinator
+        self.start_time = self.end_time = None
+
+    @property
+    def elapsed_time(self) -> float:
+        return self.end_time - self.start_time
+
+    def __enter__(self):
+        torch.cuda.synchronize()
+        if self.barrier:
+            dist.barrier()
+        self.start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.coordinator is not None:
+            self.coordinator.block_all()
+        torch.cuda.synchronize()
+        if self.barrier:
+            dist.barrier()
+        self.end_time = time.time()
+        if self.log:
+            print(f"Elapsed time for {self.name}: {self.elapsed_time:.2f} s")
+
+
+class Timers:
+    # Registry of named Timer objects; when record_time is False it hands out no-op contexts instead.
+    def __init__(self, record_time: bool, record_barrier: bool = False, coordinator: DistCoordinator | None = None):
+        self.timers = OrderedDict()
+        self.record_time = record_time
+        self.record_barrier = record_barrier
+        self.coordinator = coordinator
+
+    def __getitem__(self, name: str) -> Timer:
+        if name not in self.timers:
+            if self.record_time:
+                self.timers[name] = Timer(name, barrier=self.record_barrier, coordinator=self.coordinator)
+            else:
+                self.timers[name] = nullcontext()
+        return self.timers[name]
+
+    def to_dict(self):  # no-op placeholders have no elapsed_time, so only real timers are reported
+        return {f"time_debug/{name}": t.elapsed_time for name, t in self.timers.items() if isinstance(t, Timer)}
+
+    def to_str(self, epoch: int, step: int) -> str:
+        header = f"Rank {dist.get_rank()} | Epoch {epoch} | Step {step} | "
+        timed = ((name, t) for name, t in self.timers.items() if isinstance(t, Timer))  # skip no-op placeholders
+        return header + "".join(f"{name}: {t.elapsed_time:.2f} s | " for name, t in timed)
+
+
+def is_pipeline_enabled(plugin_type: str, plugin_config: dict) -> bool:  # "hybrid" plugin with pp_size > 1
+    return plugin_type == "hybrid" and plugin_config.get("pp_size", 1) > 1
+
+
+def is_log_process(plugin_type: str, plugin_config: dict) -> bool:  # is the calling rank the one that logs?
+    if is_pipeline_enabled(plugin_type, plugin_config):
+        return dist.get_rank() == dist.get_world_size() - 1  # with pipeline parallelism: the last rank
+    return dist.get_rank() == 0  # otherwise: rank 0
+
+
+class NsysRange:  # context manager that wraps its body in an NVTX range named `range_name`
+    def __init__(self, range_name: str):
+        self.range_name = range_name
+
+    def __enter__(self):
+        torch.cuda.nvtx.range_push(self.range_name)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        torch.cuda.nvtx.range_pop()  # closes the most recently pushed range
+
+
+class NsysProfiler:
+    """
+    Use NVIDIA Nsight Systems to profile the code.
+
+    Example (~30MB):
+    ```bash
+    /home/zhengzangwei/nsight-systems-2024.7.1/bin/nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi --capture-range-end=stop-shutdown  -o cache/nsys/report2 \
+        torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage2.py --nsys True --dataset.data-path /mnt/ddn/sora/meta/train/all_till_20241115_chunk901+img7.6M.parquet
+    ```
+
+    Example (~130MB + 2G):
+    ```bash
+    /home/zhengzangwei/nsight-systems-2024.7.1/bin/nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas --capture-range=cudaProfilerApi --capture-range-end=stop-shutdown -s process-tree --cudabacktrace=all --stats=true -o cache/nsys/report5 \
+        torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage2.py --nsys True --dataset.data-path /mnt/ddn/sora/meta/train/all_till_20241115_chunk901+img7.6M.parquet --record_time True --record_barrier True
+    ```
+
+    To generate summary statistics, use `--stats=true`.
+    To disable stack traces, use `-s none --cudabacktrace=none`; to enable them, use `-s process-tree --cudabacktrace=all`.
+    To enable timer, use `--record_time True --record_barrier True` for `scripts/diffusion/train.py`.
+    Call `step()` once per training step: capture starts at step `max(warmup_steps, 1)` and is stopped `num_steps` steps later.
+    """
+
+    def __init__(self, warmup_steps: int = 0, num_steps: int = 1, enabled: bool = True):
+        self.warmup_steps = warmup_steps
+        self.num_steps = num_steps
+        self.current_step = 0
+        self.enabled = enabled
+
+    def step(self):
+        if not self.enabled:
+            return
+        self.current_step += 1
+        # The counter is incremented before comparing, so the earliest start is step 1 (covers warmup_steps=0).
+        start_step = max(self.warmup_steps, 1)
+        if self.current_step == start_step:
+            torch.cuda.cudart().cudaProfilerStart()
+        elif self.current_step >= start_step + self.num_steps:
+            torch.cuda.cudart().cudaProfilerStop()
+
+    def range(self, range_name: str) -> NsysRange:
+        return NsysRange(range_name) if self.enabled else nullcontext()
+
+
+class ProfilerContext:
+    """Drive a ``torch.profiler`` schedule from ``step()`` calls and end the process once it is finished."""
+
+    def __init__(
+        self,
+        save_path: str = "./log",
+        record_shapes: bool = False,
+        with_stack: bool = True,
+        wait: int = 1,
+        warmup: int = 1,
+        active: int = 1,
+        repeat: int = 1,
+        enable: bool = True,
+        **kwargs,
+    ):
+        self.enable = enable
+        self.prof = None
+        self.step_cnt = 0
+        self.total_steps = (wait + warmup + active) * repeat
+        if enable:
+            self.prof = torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+                schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat),
+                record_shapes=record_shapes,
+                with_stack=with_stack,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(save_path),
+                **kwargs,
+            )
+
+    def step(self):
+        if self.enable:
+            if self.step_cnt == 0:
+                self.prof.__enter__()
+            self.prof.step()
+            self.step_cnt += 1
+            if self.is_profile_end():
+                self.prof.__exit__(None, None, None)
+                # SystemExit directly: the `exit` builtin only exists when the `site` module is loaded.
+                raise SystemExit(0)
+
+    def is_profile_end(self):
+        return self.step_cnt >= self.total_steps
+
+
+def get_process_mem():
+    """Return the resident set size (RSS) of the current process in GiB."""
+    return psutil.Process(os.getpid()).memory_info().rss / 1024**3
+
+
+def get_total_mem():  # NOTE: despite the name, this reports the *used* system memory, in GiB
+    return psutil.virtual_memory().used / 1024**3
+
+
+def print_mem(prefix: str = ""):  # prints this rank's process RSS and the system's used memory, in GiB
+    rank = dist.get_rank()
+    print(
+        f"[{rank}] {prefix} process memory: {get_process_mem():.2f} GB, total memory: {get_total_mem():.2f} GB",
+        flush=True,
+    )
diff --git a/opensora/utils/optimizer.py b/opensora/utils/optimizer.py
new file mode 100644
index 0000000..ad9e5a0
--- /dev/null
+++ b/opensora/utils/optimizer.py
@@ -0,0 +1,91 @@
+import torch
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+from torch.optim.lr_scheduler import _LRScheduler
+
+
+def create_optimizer(
+    model: torch.nn.Module,
+    optimizer_config: dict,
+) -> torch.optim.Optimizer:
+    """
+    Create an optimizer over the trainable parameters of a model.
+
+    Args:
+        model (torch.nn.Module): The model to be optimized.
+        optimizer_config (dict): Optimizer keyword arguments, plus an optional "cls"
+            entry naming the optimizer (only "HybridAdam", the default). Not modified.
+
+    Returns:
+        torch.optim.Optimizer: The optimizer.
+
+    Raises:
+        ValueError: If "cls" names an unknown optimizer.
+    """
+    optimizer_kwargs = dict(optimizer_config)  # copy: popping "cls" must not alter the caller's config
+    optimizer_name = optimizer_kwargs.pop("cls", "HybridAdam")
+    if optimizer_name != "HybridAdam":
+        raise ValueError(f"Unknown optimizer: {optimizer_name}")
+    trainable_params = (p for p in model.parameters() if p.requires_grad)
+    return HybridAdam(trainable_params, **optimizer_kwargs)
+
+
+def create_lr_scheduler(
+    optimizer: torch.optim.Optimizer,
+    num_steps_per_epoch: int,
+    epochs: int = 1000,
+    warmup_steps: int | None = None,
+    use_cosine_scheduler: bool = False,
+    initial_lr: float = 1e-6,
+) -> _LRScheduler | None:
+    """
+    Create a learning rate scheduler.
+
+    Args:
+        optimizer (torch.optim.Optimizer): The optimizer to be used.
+        num_steps_per_epoch (int): The number of steps per epoch.
+        epochs (int): The number of epochs.
+        warmup_steps (int |  None): The number of warmup steps; None means no warmup.
+        use_cosine_scheduler (bool): Whether to use cosine scheduler.
+        initial_lr (float): Learning rate the linear warmup starts from (unused by the cosine scheduler).
+
+    Returns:
+        _LRScheduler |  None: The learning rate scheduler, or None when neither
+            warmup nor the cosine scheduler is requested.
+    """
+    if use_cosine_scheduler:
+        # CosineAnnealingWarmupLR does arithmetic on warmup_steps, so map None to 0.
+        return CosineAnnealingWarmupLR(
+            optimizer,
+            total_steps=num_steps_per_epoch * epochs,
+            warmup_steps=warmup_steps or 0,
+        )
+    if warmup_steps is None:
+        return None
+    # Forward the caller's initial_lr (it used to be hard-coded to 1e-6 here).
+    return LinearWarmupLR(optimizer, initial_lr=initial_lr, warmup_steps=warmup_steps)
+
+
+class LinearWarmupLR(_LRScheduler):
+    """Linearly warm up the learning rate from ``initial_lr``, then hold each base lr.
+
+    Args:
+        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
+        initial_lr (float, optional): Learning rate the warmup ramps up from, defaults to 0.
+        warmup_steps (int, optional): Number of warmup steps, defaults to 0.
+        last_epoch (int, optional): Index of the last step, passed on to ``_LRScheduler``; defaults to -1.
+    """
+
+    def __init__(self, optimizer, initial_lr=0, warmup_steps: int = 0, last_epoch: int = -1):
+        self.initial_lr = initial_lr
+        self.warmup_steps = warmup_steps
+        super().__init__(optimizer, last_epoch=last_epoch)
+
+    def get_lr(self):
+        if self.last_epoch < self.warmup_steps:
+            return [
+                self.initial_lr + (self.last_epoch + 1) / (self.warmup_steps + 1) * (lr - self.initial_lr)
+                for lr in self.base_lrs
+            ]
+        else:
+            return self.base_lrs
diff --git a/opensora/utils/prompt_refine.py b/opensora/utils/prompt_refine.py
new file mode 100644
index 0000000..bd45268
--- /dev/null
+++ b/opensora/utils/prompt_refine.py
@@ -0,0 +1,234 @@
+import base64
+import os
+from mimetypes import guess_type
+
+from openai import OpenAI
+
+sys_prompt_t2v = """You are part of a team of bots that creates videos. The workflow is that you first create a caption of the video, and then the assistant bot will generate the video based on the caption. You work with an assistant bot that will draw anything you say.
+
+For example, outputting "a beautiful morning in the woods with the sun peaking through the trees" will trigger your partner bot to output an video of a forest morning, as described. You will be prompted by people looking to create detailed, amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
+
+There are a few rules to follow:
+
+You will only ever output a single video description per user request.
+
+You should not simply make the description longer.
+
+Video descriptions must have the same num of words as examples below. Extra words will be ignored.
+"""
+
+sys_prompt_t2i = """You are part of a team of bots that creates videos. The workflow is that you first create an image caption for the first frame of the video, and then the assistant bot will generate the video based on the image caption.
+
+For example, outputting "a beautiful morning in the woods with the sun peaking through the trees" will trigger your partner bot to output an image of a forest morning, as described. You will be prompted by people looking to create detailed, amazing videos. The way to accomplish this is to take their short prompts and make them extremely detailed and descriptive.
+
+There are a few rules to follow:
+
+You will only ever output a single image description per user request.
+
+You should not simply make the description longer.
+
+Image captions must have the same num of words as examples. Extra words will be ignored.
+
+Note: The input image is the first frame of the video, and the output image caption should include dynamic information.
+
+Note: Don't contain camera transitions!!! Don't contain screen switching!!! Don't contain perspective shifts !!!
+
+Note: Use daily language to describe the video, don't use complex words or phrases!!!
+"""
+
+sys_prompt_i2v = """You are part of a team of bots that creates videos. The workflow is that you first create a caption of the video based on the image, and then the assistant bot will generate the video based on the caption. You work with an assistant bot that will draw anything you say.
+
+Give a highly descriptive video caption based on input image and user input. As an expert, delve deep into the image with a discerning eye, leveraging rich creativity, meticulous thought. When describing the details of an video, include appropriate dynamic information to ensure that the video caption contains reasonable actions and plots. If user input is not empty, then the caption should be expanded according to the user's input.
+
+The input image is the first frame of the video, and the output video caption should describe the motion starting from the current image. User input is optional and can be empty.
+
+Answers should be comprehensive, conversational, and use complete sentences. The answer should be in English no matter what the user's input is. Provide context where necessary and maintain a certain tone.  Begin directly without introductory phrases like "The image/video showcases" "The photo captures" and more. For example, say "A scene of a woman on a beach", instead of "A woman is depicted in the image".
+
+Note: Must include appropriate dynamic information like actions, plots, etc. If the user prompt did not contain any dynamic information, then you must add some proper dynamic information like actions to make the video move!!!
+
+Note: Try begin the sentence with phrases like  "A scene of" or "A view of" or "A close-up of" to make the video more descriptive!!!
+
+Note: Use daily language to describe the video, don't use complex words or phrases!!!
+"""
+
+sys_prompt_motion_score = """
+We define a video’s motion score as its FFMPEG VMAF motion value. We now have a video generation model that accepts a desired VMAF motion value as input. To reduce user burden, please predict an optimal motion score for generating a high-quality video based on the user’s text prompt. For reference:
+	•	For runway videos featuring models, a motion score of 4 is ideal.
+	•	For static videos, a motion score of 1 is preferred.
+
+Output format:
+“{} motion score”, where {} is an integer between 1 and 15.
+
+User input:
+"""
+
+
+def image_to_url(image_path):
+    """Read an image file and return it as a base64 ``data:`` URL."""
+    # Fall back to a generic binary type when the MIME type cannot be guessed from the path.
+    mime_type = guess_type(image_path)[0] or "application/octet-stream"
+    with open(image_path, "rb") as image_file:
+        encoded = base64.b64encode(image_file.read()).decode("utf-8")
+    return f"data:{mime_type};base64,{encoded}"
+
+
+def refine_prompt(prompt: str, retry_times: int = 3, type: str = "t2v", image_path: str = None):
+    """
+    Refine a prompt to a format that can be used by the model for inference
+    """
+
+    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+    text = prompt.strip()
+    response = None
+    for i in range(retry_times):
+        if type == "t2v":
+            response = client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": f"{sys_prompt_t2v}"},
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A street with parked cars on both sides, lined with commercial buildings featuring Korean signs. The overcast sky suggests early morning or late afternoon."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A view of a street lined with parked cars on both sides. the buildings flanking the street have various signs and advertisements, some of which are in korean, indicating that this might be a location in south korea. the sky is overcast, suggesting either early morning or late afternoon light. the architecture of the buildings is typical of urban commercial areas, with storefronts on the ground level and possibly offices or residences above.",
+                    },
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "Hands with rings and bracelets wash small greenish-brown seeds in a blue basin under running water, likely outdoors."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A close-up shot of a person's hands, adorned with rings and bracelets, washing a pile of small, round, greenish-brown seeds in a blue plastic basin. the water is running from an unseen source, likely a tap, and the person is using their hands to agitate the seeds, presumably to clean them. the background is indistinct but appears to be an outdoor setting with natural light.",
+                    },
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "Three men stand near an open black car in a parking lot, with parked vehicles and a partly cloudy sky in the background."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A scene showing three men in an outdoor setting, likely a parking lot. the man on the left is wearing a light blue shirt and dark shorts, the man in the middle is dressed in a white shirt with a pattern and dark shorts, and the man on the right is wearing a green shirt and jeans. they are standing near a black car with its door open. in the background, there are parked vehicles, including a white truck and a red trailer. the sky is partly cloudy, suggesting it might be a sunny day.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input: " {text} "',
+                    },
+                ],
+                model="gpt-4o",  # glm-4-plus and gpt-4o have be tested
+                temperature=0.01,
+                top_p=0.7,
+                stream=False,
+                max_tokens=250,
+            )
+        elif type == "t2i":
+            response = client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": f"{sys_prompt_t2i}"},
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative image descriptive caption or modify an earlier caption for the user input : "a girl on the beach"',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea.",
+                    },
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative image descriptive caption or modify an earlier caption for the user input : "A man in a blue shirt"',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, against a backdrop of a snowy field.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f'Create an imaginative image descriptive caption or modify an earlier caption in ENGLISH for the user input: " {text} "',
+                    },
+                ],
+                model="gpt-4o",  # glm-4-plus and gpt-4o have be tested
+                temperature=0.01,
+                top_p=0.7,
+                stream=False,
+                max_tokens=250,
+            )
+        elif type == "i2v":
+            response = client.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {"role": "system", "content": f"{sys_prompt_i2v}"},
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "A street with parked cars on both sides, lined with commercial buildings featuring Korean signs. The overcast sky suggests early morning or late afternoon."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A view of a street lined with parked cars on both sides. the buildings flanking the street have various signs and advertisements, some of which are in korean, indicating that this might be a location in south korea. the sky is overcast, suggesting either early morning or late afternoon light. the architecture of the buildings is typical of urban commercial areas, with storefronts on the ground level and possibly offices or residences above.",
+                    },
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "Hands with rings and bracelets wash small greenish-brown seeds in a blue basin under running water, likely outdoors."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A close-up shot of a person's hands, adorned with rings and bracelets, washing a pile of small, round, greenish-brown seeds in a blue plastic basin. the water is running from an unseen source, likely a tap, and the person is using their hands to agitate the seeds, presumably to clean them. the background is indistinct but appears to be an outdoor setting with natural light.",
+                    },
+                    {
+                        "role": "user",
+                        "content": 'Create an imaginative video descriptive caption or modify an earlier caption for the user input : "Three men stand near an open black car in a parking lot, with parked vehicles and a partly cloudy sky in the background."',
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "A scene showing three men in an outdoor setting, likely a parking lot. the man on the left is wearing a light blue shirt and dark shorts, the man in the middle is dressed in a white shirt with a pattern and dark shorts, and the man on the right is wearing a green shirt and jeans. they are standing near a black car with its door open. in the background, there are parked vehicles, including a white truck and a red trailer. the sky is partly cloudy, suggesting it might be a sunny day.",
+                    },
+                    {
+                        "role": "user",
+                        "content": f'Create an imaginative video descriptive caption or modify an earlier caption in ENGLISH for the user input based on the image: " {text} "',
+                    },
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": image_to_url(image_path),
+                                },
+                            },
+                        ],
+                    },
+                ],
+                temperature=0.01,
+                top_p=0.7,
+                stream=False,
+                max_tokens=250,
+            )
+        elif type == "motion_score":
+            response = client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": f"{sys_prompt_motion_score}"},
+                    {
+                        "role": "user",
+                        "content": f"{text}",
+                    },
+                ],
+                model="gpt-4o",  # glm-4-plus and gpt-4o have be tested
+                temperature=0.01,
+                top_p=0.7,
+                stream=False,
+                max_tokens=100,
+            )
+        if response is None:
+            continue
+        if response.choices:
+            return response.choices[0].message.content
+    return prompt
+
+
+def refine_prompts(prompts: list[str], retry_times: int = 3, type: str = "t2v", image_paths: list[str] = None):
+    """Refine each prompt with ``refine_prompt``, pairing it with its image path (None when not given)."""
+    if image_paths is None:
+        image_paths = [None] * len(prompts)
+    return [
+        refine_prompt(prompt, retry_times=retry_times, type=type, image_path=image_path)
+        for prompt, image_path in zip(prompts, image_paths)
+    ]
diff --git a/opensora/utils/sampling.py b/opensora/utils/sampling.py
new file mode 100644
index 0000000..2059409
--- /dev/null
+++ b/opensora/utils/sampling.py
@@ -0,0 +1,726 @@
+import math
+import os
+import random
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, replace
+
+import torch
+from einops import rearrange, repeat
+from mmengine.config import Config
+from peft import PeftModel
+from torch import Tensor, nn
+
+from opensora.datasets.aspect import get_image_size
+from opensora.models.mmdit.model import MMDiTModel
+from opensora.models.text.conditioner import HFEmbedder
+from opensora.registry import MODELS, build_module
+from opensora.utils.inference import (
+    SamplingMethod,
+    collect_references_batch,
+    prepare_inference_condition,
+)
+
+# ======================================================
+# Sampling Options
+# ======================================================
+
+
+@dataclass
+class SamplingOption:
+    # The width of the image/video; required with height when resolution and aspect_ratio are unset.
+    width: int | None = None
+
+    # The height of the image/video; required with width when resolution and aspect_ratio are unset.
+    height: int | None = None
+
+    # The resolution of the image/video. Must be set together with aspect_ratio; the pair then sets height and width.
+    resolution: str | None = None
+
+    # The aspect ratio of the image/video. Must be set together with resolution; the pair then sets height and width.
+    aspect_ratio: str | None = None
+
+    # The number of frames.
+    num_frames: int = 1
+
+    # The number of sampling steps.
+    num_steps: int = 50
+
+    # The classifier-free guidance scale (text).
+    guidance: float = 4.0
+
+    # use oscillation for text guidance (see get_oscillation_gs)
+    text_osci: bool = False
+
+    # The classifier-free guidance (image). NOTE(review): I2VDenoiser compares it with 1.0, so None looks unusable there.
+    guidance_img: float | None = None
+
+    # use oscillation for image guidance (see get_oscillation_gs)
+    image_osci: bool = False
+
+    # use temporal scaling for image guidance; api_fn only enables it when cond_type contains "i2v"
+    scale_temporal_osci: bool = False
+
+    # The seed for the random number generator; api_fn draws a random one when this and its seed argument are None.
+    seed: int | None = None
+
+    # Whether to shift the schedule.
+    shift: bool = True
+
+    # The sampling method; a string value is converted to SamplingMethod by sanitize_sampling_option.
+    method: str | SamplingMethod = SamplingMethod.I2V
+
+    # Temporal reduction factor used by api_fn to derive the number of latent frames.
+    temporal_reduction: int = 1
+
+    # is causal vae: latent frames are (num_frames - 1) // temporal_reduction + 1 instead of num_frames // temporal_reduction
+    is_causal_vae: bool = False
+
+    # flow shift, passed to get_schedule as shift_alpha; estimated there when None
+    flow_shift: float | None = None
+
+
+def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
+    """
+    Sanitize the sampling options.
+
+    Args:
+        sampling_option (SamplingOption): The sampling options.
+
+    Returns:
+        SamplingOption: The sanitized sampling options.
+    """
+    if (
+        sampling_option.resolution is not None
+        or sampling_option.aspect_ratio is not None
+    ):
+        assert (
+            sampling_option.resolution is not None
+            and sampling_option.aspect_ratio is not None
+        ), "Both resolution and aspect ratio must be provided"
+        resolution = sampling_option.resolution
+        aspect_ratio = sampling_option.aspect_ratio
+        height, width = get_image_size(resolution, aspect_ratio, training=False)
+    else:
+        assert (
+            sampling_option.height is not None and sampling_option.width is not None
+        ), "Both height and width must be provided"
+        height, width = sampling_option.height, sampling_option.width
+
+    height = (height // 16 + (1 if height % 16 else 0)) * 16
+    width = (width // 16 + (1 if width % 16 else 0)) * 16
+    replace_dict = dict(height=height, width=width)
+
+    if isinstance(sampling_option.method, str):
+        method = SamplingMethod(sampling_option.method)
+        replace_dict["method"] = method
+
+    return replace(sampling_option, **replace_dict)
+
+
+def get_oscillation_gs(guidance_scale: float, i: int, force_num=10):
+    """
+    get oscillation guidance for cfg.
+
+    Args:
+        guidance_scale: original guidance value
+        i: denoising step
+        force_num: before which don't apply oscillation
+    """
+    if i < force_num or (i >= force_num and i % 2 == 0):
+        gs = guidance_scale
+    else:
+        gs = 1.0
+    return gs
+
+
+# ======================================================
+# Denoising
+# ======================================================
+
+
+class Denoiser(ABC):
+    @abstractmethod
+    def denoise(self, model: MMDiTModel, **kwargs) -> Tensor:
+        """Run the sampling loop with `model` on the inputs in kwargs and return the denoised tensor."""
+
+    @abstractmethod
+    def prepare_guidance(
+        self,
+        text: list[str],
+        optional_models: dict[str, nn.Module],
+        device: torch.device,
+        dtype: torch.dtype,
+        **kwargs,
+    ) -> tuple[list[str], dict[str, Tensor]]:
+        """Return the (possibly expanded) prompt list and a dict of extra inputs for `denoise`."""
+
+
+class I2VDenoiser(Denoiser):
+    def denoise(self, model: MMDiTModel, **kwargs) -> Tensor:
+        img = kwargs.pop("img")
+        timesteps = kwargs.pop("timesteps")
+        guidance = kwargs.pop("guidance")
+        guidance_img = kwargs.pop("guidance_img")
+
+        # cond ref arguments
+        masks = kwargs.pop("masks")
+        masked_ref = kwargs.pop("masked_ref")
+        kwargs.pop("sigma_min")
+
+        # oscillation guidance
+        text_osci = kwargs.pop("text_osci", False)
+        image_osci = kwargs.pop("image_osci", False)
+        scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
+
+        # patch size
+        patch_size = kwargs.pop("patch_size", 2)
+
+        guidance_vec = torch.full(
+            (img.shape[0],), guidance, device=img.device, dtype=img.dtype
+        )
+        for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
+            # timesteps
+            t_vec = torch.full(
+                (img.shape[0],), t_curr, dtype=img.dtype, device=img.device
+            )
+            b, c, t, w, h = masked_ref.size()
+            cond = torch.cat((masks, masked_ref), dim=1)
+            cond = pack(cond, patch_size=patch_size)
+            kwargs["cond"] = torch.cat([cond, cond, torch.zeros_like(cond)], dim=0)
+
+            # forward preparation
+            cond_x = img[: len(img) // 3]
+
+            img = torch.cat([cond_x, cond_x, cond_x], dim=0)
+            # forward
+            pred = model(
+                img=img,
+                **kwargs,
+                timesteps=t_vec,
+                guidance=guidance_vec,
+            )
+
+            # prepare guidance
+            text_gs = get_oscillation_gs(guidance, i) if text_osci else guidance
+            image_gs = (
+                get_oscillation_gs(guidance_img, i) if image_osci else guidance_img
+            )
+            cond, uncond, uncond_2 = pred.chunk(3, dim=0)
+            if image_gs > 1.0 and scale_temporal_osci:
+                # image_gs decrease with each denoising step
+                step_upper_image_gs = torch.linspace(image_gs, 1.0, len(timesteps))[i]
+                # image_gs increase along the temporal axis of the latent video
+                image_gs = torch.linspace(1.0, step_upper_image_gs, t)[
+                    None, None, :, None, None
+                ].repeat(b, c, 1, h, w)
+                image_gs = pack(image_gs, patch_size=patch_size).to(cond.device, cond.dtype)
+
+            # update
+            pred = uncond_2 + image_gs * (uncond - uncond_2) + text_gs * (cond - uncond)
+            pred = torch.cat([pred, pred, pred], dim=0)
+
+            img = img + (t_prev - t_curr) * pred
+
+        img = img[: len(img) // 3]
+
+        return img
+
+    def prepare_guidance(
+        self,
+        text: list[str],
+        optional_models: dict[str, nn.Module],
+        device: torch.device,
+        dtype: torch.dtype,
+        **kwargs,
+    ) -> tuple[list[str], dict[str, Tensor]]:
+        ret = {}
+
+        neg = kwargs.get("neg", None)
+        ret["guidance_img"] = kwargs.pop("guidance_img")
+
+        # text
+        if neg is None:
+            neg = [""] * len(text)
+        text = text + neg + neg
+        return text, ret
+
+
+class DistilledDenoiser(Denoiser):
+    def denoise(self, model: MMDiTModel, **kwargs) -> Tensor:
+        img = kwargs.pop("img")
+        timesteps = kwargs.pop("timesteps")
+        guidance = kwargs.pop("guidance")
+
+        guidance_vec = torch.full(
+            (img.shape[0],), guidance, device=img.device, dtype=img.dtype
+        )
+        for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
+            # timesteps
+            t_vec = torch.full(
+                (img.shape[0],), t_curr, dtype=img.dtype, device=img.device
+            )
+            # forward
+            pred = model(
+                img=img,
+                **kwargs,
+                timesteps=t_vec,
+                guidance=guidance_vec,
+            )
+            # update
+            img = img + (t_prev - t_curr) * pred
+        return img
+
+    def prepare_guidance(
+        self,
+        text: list[str],
+        optional_models: dict[str, nn.Module],
+        device: torch.device,
+        dtype: torch.dtype,
+        **kwargs,
+    ) -> tuple[list[str], dict[str, Tensor]]:
+        return text, {}
+
+
+SamplingMethodDict = {  # denoiser instances created once at import, looked up by opt.method in api_fn
+    SamplingMethod.I2V: I2VDenoiser(),  # three model branches per step (text / image / null guidance)
+    SamplingMethod.DISTILLED: DistilledDenoiser(),  # single model branch per step
+}
+
+
+# ======================================================
+# Timesteps
+# ======================================================
+
+
+def time_shift(alpha: float, t: Tensor) -> Tensor:
+    return alpha * t / (1 + (alpha - 1) * t)
+
+
+def get_res_lin_function(
+    x1: float = 256, y1: float = 1, x2: float = 4096, y2: float = 3
+) -> callable:
+    m = (y2 - y1) / (x2 - x1)
+    b = y1 - m * x1
+    return lambda x: m * x + b
+
+
+def get_schedule(
+    num_steps: int,
+    image_seq_len: int,
+    num_frames: int,
+    shift_alpha: float | None = None,
+    base_shift: float = 1,
+    max_shift: float = 3,
+    shift: bool = True,
+) -> list[float]:
+    # extra step for zero
+    timesteps = torch.linspace(1, 0, num_steps + 1)
+
+    # shifting the schedule to favor high timesteps for higher signal images
+    if shift:
+        if shift_alpha is None:
+            # estimate mu based on linear estimation between two points
+            # spatial scale
+            shift_alpha = get_res_lin_function(y1=base_shift, y2=max_shift)(
+                image_seq_len
+            )
+            # temporal scale
+            shift_alpha *= math.sqrt(num_frames)
+        # calculate shifted timesteps
+        timesteps = time_shift(shift_alpha, timesteps)
+
+    return timesteps.tolist()
+
+
+def get_noise(
+    num_samples: int,
+    height: int,
+    width: int,
+    num_frames: int,
+    device: torch.device,
+    dtype: torch.dtype,
+    seed: int,
+    patch_size: int = 2,
+    channel: int = 16,
+) -> Tensor:
+    """
+    Generate a noise tensor.
+
+    Args:
+        num_samples (int): Number of samples.
+        height (int): Height of the noise tensor.
+        width (int): Width of the noise tensor.
+        num_frames (int): Number of frames.
+        device (torch.device): Device to put the noise tensor on.
+        dtype (torch.dtype): Data type of the noise tensor.
+        seed (int): Seed for the random number generator.
+
+    Returns:
+        Tensor: The noise tensor.
+    """
+    D = int(os.environ.get("AE_SPATIAL_COMPRESSION", 16))
+    return torch.randn(
+        num_samples,
+        channel,
+        num_frames,
+        # allow for packing
+        patch_size * math.ceil(height / D),
+        patch_size * math.ceil(width / D),
+        device=device,
+        dtype=dtype,
+        generator=torch.Generator(device=device).manual_seed(seed),
+    )
+
+
+def pack(x: Tensor, patch_size: int = 2) -> Tensor:
+    return rearrange(
+        x, "b c t (h ph) (w pw) -> b (t h w) (c ph pw)", ph=patch_size, pw=patch_size
+    )
+
+
+def unpack(
+    x: Tensor, height: int, width: int, num_frames: int, patch_size: int = 2
+) -> Tensor:
+    D = int(os.environ.get("AE_SPATIAL_COMPRESSION", 16))
+    return rearrange(
+        x,
+        "b (t h w) (c ph pw) -> b c t (h ph) (w pw)",
+        h=math.ceil(height / D),
+        w=math.ceil(width / D),
+        t=num_frames,
+        ph=patch_size,
+        pw=patch_size,
+    )
+
+
+# ======================================================
+# Prepare
+# ======================================================
+
+
+def prepare(
+    t5,
+    clip: HFEmbedder,
+    img: Tensor,
+    prompt: str | list[str],
+    seq_align: int = 1,
+    patch_size: int = 2,
+) -> dict[str, Tensor]:
+    """
+    Prepare the input for the model.
+
+    Args:
+        t5 (HFEmbedder): The T5 model.
+        clip (HFEmbedder): The CLIP model.
+        img (Tensor): The image tensor.
+        prompt (str | list[str]): The prompt(s).
+
+    Returns:
+        dict[str, Tensor]: The input dictionary.
+
+        img_ids: used for positional embedding in T,H,W dimensions later
+        text_ids: for positional embedding, but set to 0 for now since our text encoder already encodes positional information
+    """
+    bs, c, t, h, w = img.shape
+    device, dtype = img.device, img.dtype
+    if isinstance(prompt, str):
+        prompt = [prompt]
+    if bs != len(prompt):
+        bs = len(prompt)
+
+    img = rearrange(
+        img, "b c t (h ph) (w pw) -> b (t h w) (c ph pw)", ph=patch_size, pw=patch_size
+    )
+    if img.shape[0] != bs:
+        img = repeat(img, "b ... -> (repeat b) ...", repeat=bs // img.shape[0])
+
+    img_ids = torch.zeros(t, h // patch_size, w // patch_size, 3)
+    img_ids[..., 0] = img_ids[..., 0] + torch.arange(t)[:, None, None]
+    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // patch_size)[None, :, None]
+    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // patch_size)[None, None, :]
+    img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
+
+    # Encode the tokenized prompts
+    txt = t5(prompt, added_tokens=img_ids.shape[1], seq_align=seq_align)
+    if txt.shape[0] == 1 and bs > 1:
+        txt = repeat(txt, "1 ... -> bs ...", bs=bs)
+    txt_ids = torch.zeros(bs, txt.shape[1], 3)
+
+    vec = clip(prompt)
+    if vec.shape[0] == 1 and bs > 1:
+        vec = repeat(vec, "1 ... -> bs ...", bs=bs)
+
+    return {
+        "img": img,
+        "img_ids": img_ids.to(device, dtype),
+        "txt": txt.to(device, dtype),
+        "txt_ids": txt_ids.to(device, dtype),
+        "y_vec": vec.to(device, dtype),
+    }
+
+
+def prepare_ids(
+    img: Tensor,
+    t5_embedding: Tensor,
+    clip_embedding: Tensor,
+) -> dict[str, Tensor]:
+    """
+    Prepare the input for the model.
+
+    Args:
+        img (Tensor): The image tensor.
+        t5_embedding (Tensor): The T5 embedding.
+        clip_embedding (Tensor): The CLIP embedding.
+
+    Returns:
+        dict[str, Tensor]: The input dictionary.
+
+        img_ids: used for positional embedding in T,H,W dimensions later
+        text_ids: for positional embedding, but set to 0 for now since our text encoder already encodes positional information
+    """
+    bs, c, t, h, w = img.shape
+    device, dtype = img.device, img.dtype
+
+    img = rearrange(img, "b c t (h ph) (w pw) -> b (t h w) (c ph pw)", ph=2, pw=2)
+    if img.shape[0] != bs:
+        img = repeat(img, "b ... -> (repeat b) ...", repeat=bs // img.shape[0])
+
+    img_ids = torch.zeros(t, h // 2, w // 2, 3)
+    img_ids[..., 0] = img_ids[..., 0] + torch.arange(t)[:, None, None]
+    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[None, :, None]
+    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, None, :]
+    img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
+
+    # Encode the tokenized prompts
+    if t5_embedding.shape[0] == 1 and bs > 1:
+        t5_embedding = repeat(t5_embedding, "1 ... -> bs ...", bs=bs)
+    txt_ids = torch.zeros(bs, t5_embedding.shape[1], 3)
+
+    if clip_embedding.shape[0] == 1 and bs > 1:
+        clip_embedding = repeat(clip_embedding, "1 ... -> bs ...", bs=bs)
+
+    return {
+        "img": img,
+        "img_ids": img_ids.to(device, dtype),
+        "txt": t5_embedding.to(device, dtype),
+        "txt_ids": txt_ids.to(device, dtype),
+        "y_vec": clip_embedding.to(device, dtype),
+    }
+
+
+def prepare_models(
+    cfg: Config,
+    device: torch.device,
+    dtype: torch.dtype,
+    offload_model: bool = False,
+) -> tuple[nn.Module, nn.Module, nn.Module, nn.Module, dict[str, nn.Module]]:
+    """
+    Prepare models for inference.
+
+    Args:
+        cfg (Config): The configuration object.
+        device (torch.device): The device to use.
+        dtype (torch.dtype): The data type to use.
+
+    Returns:
+        tuple[nn.Module, nn.Module, nn.Module, nn.Module, dict[str, nn.Module]]: The models. They are the diffusion model, the autoencoder model, the T5 model, the CLIP model, and the optional models.
+    """
+    model_device = (
+        "cpu" if offload_model and cfg.get("img_flux", None) is not None else device
+    )
+
+    model = build_module(
+        cfg.model, MODELS, device_map=model_device, torch_dtype=dtype
+    ).eval()
+    model_ae = build_module(
+        cfg.ae, MODELS, device_map=model_device, torch_dtype=dtype
+    ).eval()
+    model_t5 = build_module(cfg.t5, MODELS, device_map=device, torch_dtype=dtype).eval()
+    model_clip = build_module(
+        cfg.clip, MODELS, device_map=device, torch_dtype=dtype
+    ).eval()
+    if cfg.get("pretrained_lora_path", None) is not None:
+        model = PeftModel.from_pretrained(
+            model, cfg.pretrained_lora_path, is_trainable=False
+        )
+
+    # optional models
+    optional_models = {}
+    if cfg.get("img_flux", None) is not None:
+        model_img_flux = build_module(
+            cfg.img_flux, MODELS, device_map=device, torch_dtype=dtype
+        ).eval()
+        model_ae_img_flux = build_module(
+            cfg.img_flux_ae, MODELS, device_map=device, torch_dtype=dtype
+        ).eval()
+        optional_models["img_flux"] = model_img_flux
+        optional_models["img_flux_ae"] = model_ae_img_flux
+
+    return model, model_ae, model_t5, model_clip, optional_models
+
+
+def prepare_api(
+    model: nn.Module,
+    model_ae: nn.Module,
+    model_t5: nn.Module,
+    model_clip: nn.Module,
+    optional_models: dict[str, nn.Module],
+) -> callable:
+    """
+    Prepare the API function for inference.
+
+    Args:
+        model (nn.Module): The diffusion model.
+        model_ae (nn.Module): The autoencoder model.
+        model_t5 (nn.Module): The T5 model.
+        model_clip (nn.Module): The CLIP model.
+
+    Returns:
+        callable: The API function for inference.
+    """
+
+    @torch.inference_mode()
+    def api_fn(
+        opt: SamplingOption,
+        cond_type: str = "t2v",
+        seed: int = None,
+        sigma_min: float = 1e-5,
+        text: list[str] = None,
+        neg: list[str] = None,
+        patch_size: int = 2,
+        channel: int = 16,
+        **kwargs,
+    ):
+        """
+        The API function for inference.
+
+        Args:
+            opt (SamplingOption): The sampling options.
+            text (list[str], optional): The text prompts. Defaults to None.
+            neg (list[str], optional): The negative text prompts. Defaults to None.
+
+        Returns:
+            torch.Tensor: The generated images.
+        """
+        device = next(model.parameters()).device
+        dtype = next(model.parameters()).dtype
+
+        # passing seed will overwrite opt seed
+        if seed is None:
+            # random seed if not provided
+            seed = opt.seed if opt.seed is not None else random.randint(0, 2**32 - 1)
+        if opt.is_causal_vae:
+            num_frames = (
+                1
+                if opt.num_frames == 1
+                else (opt.num_frames - 1) // opt.temporal_reduction + 1
+            )
+        else:
+            num_frames = (
+                1 if opt.num_frames == 1 else opt.num_frames // opt.temporal_reduction
+            )
+
+        z = get_noise(
+            len(text),
+            opt.height,
+            opt.width,
+            num_frames,
+            device,
+            dtype,
+            seed,
+            patch_size=patch_size,
+            channel=channel // (patch_size**2),
+        )
+        denoiser = SamplingMethodDict[opt.method]
+
+        # i2v reference conditions
+        references = [None] * len(text)
+        if cond_type != "t2v" and "ref" in kwargs:
+            reference_path_list = kwargs.pop("ref")
+            references = collect_references_batch(
+                reference_path_list,
+                cond_type,
+                model_ae,
+                (opt.height, opt.width),
+                is_causal=opt.is_causal_vae,
+            )
+        elif cond_type != "t2v":
+            print(
+                "your csv file doesn't have a ref column or is not processed properly. will default to cond_type t2v!"
+            )
+            cond_type = "t2v"
+
+        # timestep editing
+        timesteps = get_schedule(
+            opt.num_steps,
+            (z.shape[-1] * z.shape[-2]) // patch_size**2,
+            num_frames,
+            shift=opt.shift,
+            shift_alpha=opt.flow_shift,
+        )
+
+        # prepare classifier-free guidance data (method specific)
+        text, additional_inp = denoiser.prepare_guidance(
+            text=text,
+            optional_models=optional_models,
+            device=device,
+            dtype=dtype,
+            neg=neg,
+            guidance_img=opt.guidance_img,
+        )
+
+        inp = prepare(model_t5, model_clip, z, prompt=text, patch_size=patch_size)
+        inp.update(additional_inp)
+
+        if opt.method in [SamplingMethod.I2V]:
+            # prepare references
+            masks, masked_ref = prepare_inference_condition(
+                z, cond_type, ref_list=references, causal=opt.is_causal_vae
+            )
+            inp["masks"] = masks
+            inp["masked_ref"] = masked_ref
+            inp["sigma_min"] = sigma_min
+
+        x = denoiser.denoise(
+            model,
+            **inp,
+            timesteps=timesteps,
+            guidance=opt.guidance,
+            text_osci=opt.text_osci,
+            image_osci=opt.image_osci,
+            scale_temporal_osci=(
+                opt.scale_temporal_osci and "i2v" in cond_type
+            ),  # don't use temporal osci for v2v or t2v
+            flow_shift=opt.flow_shift,
+            patch_size=patch_size,
+        )
+
+        x = unpack(x, opt.height, opt.width, num_frames, patch_size=patch_size)
+
+        # replace for image condition
+        if cond_type == "i2v_head":
+            x[0, :, :1] = references[0][0]
+        elif cond_type == "i2v_tail":
+            x[0, :, -1:] = references[0][0]
+        elif cond_type == "i2v_loop":
+            x[0, :, :1] = references[0][0]
+            x[0, :, -1:] = references[0][1]
+
+        x = model_ae.decode(x)
+        x = x[:, :, : opt.num_frames]  # image
+
+        # remove the duplicate frames
+        if not opt.is_causal_vae:
+            if cond_type == "i2v_head":
+                pad_len = model_ae.compression[0] - 1
+                x = x[:, :, pad_len:]
+            elif cond_type == "i2v_tail":
+                pad_len = model_ae.compression[0] - 1
+                x = x[:, :, :-pad_len]
+            elif cond_type == "i2v_loop":
+                pad_len = model_ae.compression[0] - 1
+                x = x[:, :, pad_len:-pad_len]
+
+        return x
+
+    return api_fn
diff --git a/opensora/utils/train.py b/opensora/utils/train.py
new file mode 100644
index 0000000..beac7d5
--- /dev/null
+++ b/opensora/utils/train.py
@@ -0,0 +1,458 @@
+import random
+import warnings
+from collections import OrderedDict
+from datetime import timedelta
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from colossalai.booster.plugin import HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device
+from einops import rearrange
+from torch import nn
+from torch.optim.lr_scheduler import _LRScheduler
+from tqdm import tqdm
+
+from opensora.acceleration.parallel_states import (
+    set_data_parallel_group,
+    set_sequence_parallel_group,
+    set_tensor_parallel_group,
+)
+from opensora.utils.optimizer import LinearWarmupLR
+
+
+def set_lr(
+    optimizer: torch.optim.Optimizer,
+    lr_scheduler: _LRScheduler,
+    lr: float,
+    initial_lr: float = None,
+):
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+    if isinstance(lr_scheduler, LinearWarmupLR):
+        lr_scheduler.base_lrs = [lr] * len(lr_scheduler.base_lrs)
+        if initial_lr is not None:
+            lr_scheduler.initial_lr = initial_lr
+
+
+def set_warmup_steps(
+    lr_scheduler: _LRScheduler,
+    warmup_steps: int,
+):
+    if isinstance(lr_scheduler, LinearWarmupLR):
+        lr_scheduler.warmup_steps = warmup_steps
+
+
+def set_eps(
+    optimizer: torch.optim.Optimizer,
+    eps: float = None,
+):
+    if eps is not None:
+        for param_group in optimizer.param_groups:
+            param_group["eps"] = eps
+
+
+def setup_device() -> tuple[torch.device, DistCoordinator]:
+    """
+    Setup the device and the distributed coordinator.
+
+    Returns:
+        tuple[torch.device, DistCoordinator]: The device and the distributed coordinator.
+    """
+    assert torch.cuda.is_available(), "Training currently requires at least one GPU."
+    # NOTE: A very large timeout is set to avoid some processes exit early
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+    coordinator = DistCoordinator()
+    device = get_current_device()
+
+    return device, coordinator
+
+
+def create_colossalai_plugin(
+    plugin: str,
+    dtype: str,
+    grad_clip: float,
+    **kwargs,
+) -> LowLevelZeroPlugin | HybridParallelPlugin:
+    """
+    Create a ColossalAI plugin.
+
+    Args:
+        plugin (str): The plugin name.
+        dtype (str): The data type.
+        grad_clip (float): The gradient clip value.
+
+    Returns:
+        LowLevelZeroPlugin |  HybridParallelPlugin: The plugin.
+    """
+    plugin_kwargs = dict(
+        precision=dtype,
+        initial_scale=2**16,
+        max_norm=grad_clip,
+        overlap_allgather=True,
+        cast_inputs=False,
+        reduce_bucket_size_in_m=20,
+    )
+    plugin_kwargs.update(kwargs)
+    sp_size = plugin_kwargs.get("sp_size", 1)
+    if plugin == "zero1" or plugin == "zero2":
+        assert sp_size == 1, "Zero plugin does not support sequence parallelism"
+        stage = 1 if plugin == "zero1" else 2
+        plugin = LowLevelZeroPlugin(
+            stage=stage,
+            **plugin_kwargs,
+        )
+        set_data_parallel_group(dist.group.WORLD)
+    elif plugin == "hybrid":
+        plugin_kwargs["find_unused_parameters"] = True
+        reduce_bucket_size_in_m = plugin_kwargs.pop("reduce_bucket_size_in_m")
+        if "zero_bucket_size_in_m" not in plugin_kwargs:
+            plugin_kwargs["zero_bucket_size_in_m"] = reduce_bucket_size_in_m
+        plugin_kwargs.pop("cast_inputs")
+        plugin_kwargs["enable_metadata_cache"] = False
+
+        custom_policy = plugin_kwargs.pop("custom_policy", None)
+        if custom_policy is not None:
+            custom_policy = custom_policy()
+        plugin = HybridParallelPlugin(
+            custom_policy=custom_policy,
+            **plugin_kwargs,
+        )
+        set_tensor_parallel_group(plugin.tp_group)
+        set_sequence_parallel_group(plugin.sp_group)
+        set_data_parallel_group(plugin.dp_group)
+    else:
+        raise ValueError(f"Unknown plugin {plugin}")
+    return plugin
+
+
+@torch.no_grad()
+def update_ema(
+    ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
+):
+    """
+    Step the EMA model towards the current model.
+
+    Args:
+        ema_model (torch.nn.Module): The EMA model.
+        model (torch.nn.Module): The current model.
+        optimizer (torch.optim.Optimizer): The optimizer.
+        decay (float): The decay rate.
+        sharded (bool): Whether the model is sharded.
+    """
+    ema_params = OrderedDict(ema_model.named_parameters())
+    model_params = OrderedDict(model.named_parameters())
+
+    for name, param in model_params.items():
+        if name == "pos_embed":
+            continue
+        if not param.requires_grad:
+            continue
+        if not sharded:
+            param_data = param.data
+            ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
+        else:
+            if param.data.dtype != torch.float32:
+                param_id = id(param)
+                master_param = optimizer.get_working_to_master_map()[param_id]
+                param_data = master_param.data
+            else:
+                param_data = param.data
+            ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
+
+
+def dropout_condition(prob: float, txt: torch.Tensor, null_txt: torch.Tensor) -> torch.Tensor:
+    """
+    Apply dropout to the text tensor.
+
+    Args:
+        prob (float): The dropout probability.
+        txt (torch.Tensor): The text tensor.
+        null_txt (torch.Tensor): The null text tensor.
+
+    Returns:
+        torch.Tensor: The text tensor with dropout applied.
+    """
+    if prob == 0:
+        warnings.warn("Dropout probability is 0, skipping dropout")
+    drop_ids = torch.rand(txt.shape[0], device=txt.device) < prob
+    drop_ids = drop_ids.view((drop_ids.shape[0],) + (1,) * (txt.ndim - 1))
+    new_txt = torch.where(drop_ids, null_txt, txt)
+    return new_txt
+
+
+def prepare_visual_condition_uncausal(
+    x: torch.Tensor, condition_config: dict, model_ae: torch.nn.Module, pad: bool = False
+) -> torch.Tensor:
+    """
+    Prepare the visual condition for the model.
+
+    Args:
+        x: (torch.Tensor): The input video tensor.
+        condition_config (dict): The condition configuration.
+        model_ae (torch.nn.Module): The video encoder module.
+
+    Returns:
+        torch.Tensor: The visual condition tensor.
+    """
+    # x has shape [b, c, t, h, w], where b is the batch size
+    B = x.shape[0]
+    C = model_ae.cfg.latent_channels
+    T, H, W = model_ae.get_latent_size(x.shape[-3:])
+
+    # Initialize masks tensor to match the shape of x, but only the time dimension will be masked
+    masks = torch.zeros(B, 1, T, H, W).to(
+        x.device, x.dtype
+    )  # broadcasting over channel, concat to masked_x with 1 + 16 = 17 channesl
+    # to prevent information leakage, image must be encoded separately and copied to latent
+    latent = torch.zeros(B, C, T, H, W).to(x.device, x.dtype)
+    x_0 = torch.zeros(B, C, T, H, W).to(x.device, x.dtype)
+    if T > 1:  # video
+        # certain v2v conditions not are applicable for short videos
+        if T <= 32 // model_ae.time_compression_ratio:
+            condition_config.pop("v2v_head", None)  # given first 32 frames
+            condition_config.pop("v2v_tail", None)  # given last 32 frames
+            condition_config.pop("v2v_head_easy", None)  # given first 64 frames
+            condition_config.pop("v2v_tail_easy", None)  # given last 64 frames
+        if T <= 64 // model_ae.time_compression_ratio:
+            condition_config.pop("v2v_head_easy", None)  # given first 64 frames
+            condition_config.pop("v2v_tail_easy", None)  # given last 64 frames
+
+        mask_cond_options = list(condition_config.keys())  # list of mask conditions
+        mask_cond_weights = list(condition_config.values())  # corresponding probabilities
+
+        for i in range(B):
+            # Randomly select a mask condition based on the provided probabilities
+            mask_cond = random.choices(mask_cond_options, weights=mask_cond_weights, k=1)[0]
+            # Apply the selected mask condition directly on the masks tensor
+            if mask_cond == "i2v_head":  # NOTE: modify video, mask first latent frame
+                # padded video such that the first latent frame correspond to image only
+                masks[i, :, 0, :, :] = 1
+                if pad:
+                    pad_num = model_ae.time_compression_ratio - 1  # 32 --> new video: 7 + (1+31-7)
+                    padded_x = torch.cat([x[i, :, :1]] * pad_num + [x[i, :, :-pad_num]], dim=1).unsqueeze(0)
+                    x_0[i] = model_ae.encode(padded_x)[0]
+                else:
+                    x_0[i] = model_ae.encode(x[i : i + 1])[0]
+                # condition: encode the image only
+                latent[i, :, :1, :, :] = model_ae.encode(
+                    x[i, :, :1, :, :].unsqueeze(0)
+                )  # since the first dimension of right hand side is singleton, torch auto-ignores it
+            elif mask_cond == "i2v_loop":  # # NOTE: modify video, mask first and last latent frame
+                # pad video such that first and last latent frame correspond to image only
+                masks[i, :, 0, :, :] = 1
+                masks[i, :, -1, :, :] = 1
+                if pad:
+                    pad_num = model_ae.time_compression_ratio - 1
+                    padded_x = torch.cat(
+                        [x[i, :, :1]] * pad_num
+                        + [x[i, :, : -pad_num * 2]]
+                        + [x[i, :, -pad_num * 2 - 1].unsqueeze(1)] * pad_num,
+                        dim=1,
+                    ).unsqueeze(
+                        0
+                    )  # remove the last pad_num * 2 frames from the end of the video
+                    x_0[i] = model_ae.encode(padded_x)[0]
+                    # condition: encode the image only
+                    latent[i, :, :1, :, :] = model_ae.encode(x[i, :, :1, :, :].unsqueeze(0))
+                    latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -pad_num * 2 - 1, :, :].unsqueeze(1).unsqueeze(0))
+                else:
+                    x_0[i] = model_ae.encode(x[i : i + 1])[0]
+                    latent[i, :, :1, :, :] = model_ae.encode(x[i, :, :1, :, :].unsqueeze(0))
+                    latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -1:, :, :].unsqueeze(0))
+            elif mask_cond == "i2v_tail":  # mask the last latent frame
+                masks[i, :, -1, :, :] = 1
+                if pad:
+                    pad_num = model_ae.time_compression_ratio - 1
+                    padded_x = torch.cat([x[i, :, pad_num:]] + [x[i, :, -1:]] * pad_num, dim=1).unsqueeze(0)
+                    x_0[i] = model_ae.encode(padded_x)[0]
+                    latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -pad_num * 2 - 1, :, :].unsqueeze(1).unsqueeze(0))
+                else:
+                    x_0[i] = model_ae.encode(x[i : i + 1])[0]
+                    latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -1:, :, :].unsqueeze(0))
+            elif mask_cond == "v2v_head":  # mask the first 32 video frames
+                assert T > 32 // model_ae.time_compression_ratio
+                conditioned_t = 32 // model_ae.time_compression_ratio
+                masks[i, :, :conditioned_t, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                latent[i, :, :conditioned_t, :, :] = x_0[i, :, :conditioned_t, :, :]
+            elif mask_cond == "v2v_tail":  # mask the last 32 video frames
+                assert T > 32 // model_ae.time_compression_ratio
+                conditioned_t = 32 // model_ae.time_compression_ratio
+                masks[i, :, -conditioned_t:, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                latent[i, :, -conditioned_t:, :, :] = x_0[i, :, -conditioned_t:, :, :]
+            elif mask_cond == "v2v_head_easy":  # mask the first 64 video frames
+                assert T > 64 // model_ae.time_compression_ratio
+                conditioned_t = 64 // model_ae.time_compression_ratio
+                masks[i, :, :conditioned_t, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                latent[i, :, :conditioned_t, :, :] = x_0[i, :, :conditioned_t, :, :]
+            elif mask_cond == "v2v_tail_easy":  # mask the last 64 video frames
+                assert T > 64 // model_ae.time_compression_ratio
+                conditioned_t = 64 // model_ae.time_compression_ratio
+                masks[i, :, -conditioned_t:, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                latent[i, :, -conditioned_t:, :, :] = x_0[i, :, -conditioned_t:, :, :]
+            # elif mask_cond == "v2v_head":  # mask from the beginning to a random point
+            #     masks[i, :, : random.randint(1, T - 2), :, :] = 1
+            # elif mask_cond == "v2v_tail":  # mask from a random point to the end
+            #     masks[i, :, -random.randint(1, T - 2) :, :, :] = 1
+            else:
+                # "t2v" is the fallback case where no specific condition is specified
+                assert mask_cond == "t2v", f"Unknown mask condition {mask_cond}"
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+    else:  # image
+        x_0 = model_ae.encode(x)  # latent video
+
+    latent = masks * latent  # condition latent
+    # merge the masks and the masked_x into a single tensor
+    cond = torch.cat((masks, latent), dim=1)
+    return x_0, cond
+
+
+def prepare_visual_condition_causal(x: torch.Tensor, condition_config: dict, model_ae: torch.nn.Module) -> torch.Tensor:
+    """
+    Prepare the visual condition for the model.
+
+    Args:
+        x: (torch.Tensor): The input video tensor.
+        condition_config (dict): The condition configuration.
+        model_ae (torch.nn.Module): The video encoder module.
+
+    Returns:
+        torch.Tensor: The visual condition tensor.
+    """
+    # x has shape [b, c, t, h, w], where b is the batch size
+    B = x.shape[0]
+    C = model_ae.cfg.latent_channels
+    T, H, W = model_ae.get_latent_size(x.shape[-3:])
+
+    # Initialize masks tensor to match the shape of x, but only the time dimension will be masked
+    masks = torch.zeros(B, 1, T, H, W).to(
+        x.device, x.dtype
+    )  # broadcasting over channel, concat to masked_x with 1 + 16 = 17 channesl
+    # to prevent information leakage, image must be encoded separately and copied to latent
+    latent = torch.zeros(B, C, T, H, W).to(x.device, x.dtype)
+    x_0 = torch.zeros(B, C, T, H, W).to(x.device, x.dtype)
+    if T > 1:  # video
+        # certain v2v conditions not are applicable for short videos
+        if T <= (32 // model_ae.time_compression_ratio) + 1:
+            condition_config.pop("v2v_head", None)  # given first 33 frames
+            condition_config.pop("v2v_tail", None)  # given last 33 frames
+            condition_config.pop("v2v_head_easy", None)  # given first 65 frames
+            condition_config.pop("v2v_tail_easy", None)  # given last 65 frames
+        if T <= (64 // model_ae.time_compression_ratio) + 1:
+            condition_config.pop("v2v_head_easy", None)  # given first 65 frames
+            condition_config.pop("v2v_tail_easy", None)  # given last 65 frames
+
+        mask_cond_options = list(condition_config.keys())  # list of mask conditions
+        mask_cond_weights = list(condition_config.values())  # corresponding probabilities
+
+        for i in range(B):
+            # Randomly select a mask condition based on the provided probabilities
+            mask_cond = random.choices(mask_cond_options, weights=mask_cond_weights, k=1)[0]
+            # Apply the selected mask condition directly on the masks tensor
+
+            if mask_cond == "i2v_head":  # NOTE: modify video, mask first latent frame
+                masks[i, :, 0, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                # condition: encode the image only
+                latent[i, :, :1, :, :] = model_ae.encode(x[i, :, :1, :, :].unsqueeze(0))
+
+            elif mask_cond == "i2v_loop":  # # NOTE: modify video, mask first and last latent frame
+                # pad video such that first and last latent frame correspond to image only
+                masks[i, :, 0, :, :] = 1
+                masks[i, :, -1, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                # condition: encode the image only
+                latent[i, :, :1, :, :] = model_ae.encode(x[i, :, :1, :, :].unsqueeze(0))
+                latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -1:, :, :].unsqueeze(0))
+
+            elif mask_cond == "i2v_tail":  # mask the last latent frame
+                masks[i, :, -1, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                # condition: encode the last image only
+                latent[i, :, -1:, :, :] = model_ae.encode(x[i, :, -1:, :, :].unsqueeze(0))
+
+            elif "v2v_head" in mask_cond:  # mask the first 33 video frames
+                ref_t = 33 if not "easy" in mask_cond else 65
+                assert (ref_t - 1) % model_ae.time_compression_ratio == 0
+                conditioned_t = (ref_t - 1) // model_ae.time_compression_ratio + 1
+                masks[i, :, :conditioned_t, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                # encode the first ref_t frame video separately
+                latent[i, :, :conditioned_t, :, :] = model_ae.encode(x[i, :, :ref_t, :, :].unsqueeze(0))
+
+            elif "v2v_tail" in mask_cond:  # mask the last 32 video frames
+                ref_t = 33 if not "easy" in mask_cond else 65
+                assert (ref_t - 1) % model_ae.time_compression_ratio == 0
+                conditioned_t = (ref_t - 1) // model_ae.time_compression_ratio + 1
+                masks[i, :, -conditioned_t:, :, :] = 1
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+                # encode the first ref_t frame video separately
+                latent[i, :, -conditioned_t:, :, :] = model_ae.encode(x[i, :, -ref_t:, :, :].unsqueeze(0))
+            else:
+                # "t2v" is the fallback case where no specific condition is specified
+                assert mask_cond == "t2v", f"Unknown mask condition {mask_cond}"
+                x_0[i] = model_ae.encode(x[i].unsqueeze(0))[0]
+    else:  # image
+        x_0 = model_ae.encode(x)  # latent video
+
+    latent = masks * latent  # condition latent
+    # merge the masks and the masked_x into a single tensor
+    cond = torch.cat((masks, latent), dim=1)
+    return x_0, cond
+
+
+def get_batch_loss(model_pred, v_t, masks=None):
+    """Mean-squared error between ``model_pred`` and ``v_t``; with ``masks``, conditioned edge frames can be excluded."""
+    if masks is not None:  # shape [B, T, H, W]
+        num_frames, height, width = masks.shape[-3:]
+        masks = masks[:, :, 0, 0]  # only look at [B, T]
+        model_pred = rearrange(  # token sequence -> [b, c, t, h, w] volume, undoing the 2x2 spatial patching
+            model_pred,
+            "b (t h w) (c ph pw) -> b c t (h ph) (w pw)",
+            h=height // 2,
+            w=width // 2,
+            t=num_frames,
+            ph=2,
+            pw=2,
+        )
+        v_t = rearrange(  # same layout change for the target so both can be sliced along time
+            v_t,
+            "b (t h w) (c ph pw) -> b c t (h ph) (w pw)",
+            h=height // 2,
+            w=width // 2,
+            t=num_frames,
+            ph=2,
+            pw=2,
+        )
+
+        batch_loss = 0
+        for i in range(model_pred.size(0)):
+            pred_val = model_pred[i]
+            target_val = v_t[i]
+            if masks[i][0] == 1 and (not 1 in masks[i][1:-1]):  # first frame masked, no interior frame masked: drop it
+                pred_val = pred_val[:, 1:]
+                target_val = target_val[:, 1:]
+            if masks[i][-1] == 1 and (not 1 in masks[i][1:-1]):  # last frame masked, no interior frame masked: drop it
+                pred_val = pred_val[:, :-1]
+                target_val = target_val[:, :-1]
+            batch_loss += F.mse_loss(pred_val.float(), target_val.float(), reduction="mean")
+            # samples with any interior frame masked are not trimmed: all of their frames enter the loss
+        loss = batch_loss / model_pred.size(0)  # average of the per-sample means
+    else:
+        # use reduction mean so that each batch will have same level of influence regardless of batch size
+        loss = F.mse_loss(model_pred.float(), v_t.float(), reduction="mean")
+    return loss
+
+
+@torch.no_grad()
+def warmup_ae(model_ae: nn.Module, shapes: list[tuple[int, ...]], device: torch.device, dtype: torch.dtype):
+    progress_bar = tqdm(shapes, desc="Warmup AE", disable=dist.get_rank() != 0)
+    for x_shape in progress_bar:
+        x = torch.randn(*x_shape, device=device, dtype=dtype)
+        _ = model_ae.encode(x)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b541452
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+torch==2.4.0
+torchvision==0.19.0
+colossalai>=0.4.4
+mmengine>=0.10.3
+ftfy>=6.2.0 # for t5
+accelerate>=0.29.2 # for t5
+av==13.1.0 # for video loading
+liger-kernel==0.5.2
+pandas>=2.0.3
+pandarallel>=1.6.5
+openai>=1.52.2
+wandb>=0.17.0
+tensorboard>=2.14.0
+pre-commit>=3.5.0
+omegaconf>=2.3.0
+pyarrow
diff --git a/requirements/requirements-cu121.txt b/requirements/requirements-cu121.txt
new file mode 100644
index 0000000..362381d
--- /dev/null
+++ b/requirements/requirements-cu121.txt
@@ -0,0 +1,3 @@
+torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121
+torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121
+xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121
diff --git a/requirements/requirements-data.txt b/requirements/requirements-data.txt
new file mode 100644
index 0000000..7a2d38d
--- /dev/null
+++ b/requirements/requirements-data.txt
@@ -0,0 +1,30 @@
+gdown>=5.2.0
+
+# [caption llava]
+ninja>=1.11.1.1
+shortuuid>=1.0.13
+markdown2[all]
+scikit-learn>=1.4.2
+einops-exts>=0.0.4
+
+# [camera_motion]
+decord==0.6.0
+ptvsd==4.3.2
+imageio-ffmpeg>=0.4.9
+
+# [datasets]
+ffmpeg-python==0.2.0
+lingua-language-detector==2.0.2
+
+# [frame interpolation]
+imageio>=2.34.1
+
+# [aesthetic]
+setuptools==68.2.2
+clip @ git+https://github.com/openai/CLIP.git
+
+# [ocr]
+mmcv==2.1.0
+mmdet==3.1.0
+mmocr==1.0.1
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992
diff --git a/requirements/requirements-eval.txt b/requirements/requirements-eval.txt
new file mode 100644
index 0000000..6a7ef3a
--- /dev/null
+++ b/requirements/requirements-eval.txt
@@ -0,0 +1,15 @@
+# [vbench]
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git@ff53992
+imageio>=2.34.1
+pyiqa==0.1.10
+scikit-learn>=1.4.2
+scikit-image>=0.20.0
+lvis==0.5.3
+boto3>=1.34.113
+easydict>=1.9
+fairscale>=0.4.13
+
+# [vae]
+decord==0.6.0
+pytorchvideo==0.1.5
+lpips==0.1.4
diff --git a/requirements/requirements-pllava.txt b/requirements/requirements-pllava.txt
new file mode 100644
index 0000000..238dcc4
--- /dev/null
+++ b/requirements/requirements-pllava.txt
@@ -0,0 +1,248 @@
+absl-py==2.1.0
+accelerate==0.26.1
+addict==2.4.0
+aiofiles==23.2.1
+aliyun-python-sdk-core==2.15.0
+aliyun-python-sdk-kms==2.16.2
+altair==5.2.0
+annotated-types==0.6.0
+antlr4-python3-runtime==4.9.3
+anyio==4.3.0
+anykeystore==0.2
+apex==0.9.10.dev0
+appdirs==1.4.4
+argcomplete==3.2.3
+attrs==23.2.0
+av==10.0.0
+beautifulsoup4==4.12.3
+blessed==1.20.0
+blessings==1.7
+boto3==1.34.63
+botocore==1.34.63
+Brotli==1.1.0
+cachetools==5.3.3
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+crcmod==1.7
+cryptacular==1.6.2
+cryptography==42.0.5
+cycler==0.12.1
+dacite==1.7.0
+decorator==4.4.2
+decord==0.6.0
+deepspeed==0.14.0
+defusedxml==0.7.1
+Deprecated==1.2.14
+dill==0.3.8
+distro==1.9.0
+dnspython==2.6.1
+docker-pycreds==0.4.0
+einops==0.6.1
+exceptiongroup==1.2.0
+fastapi==0.110.0
+ffmpeg==1.4
+ffmpy==0.3.2
+fiftyone==0.23.6
+fiftyone-brain==0.16.1
+fiftyone_db==1.1.2
+filelock==3.9.0
+fonttools==4.49.0
+fsspec==2024.2.0
+ftfy==6.1.3
+future==1.0.0
+fvcore==0.1.5.post20221221
+gdown==5.1.0
+gitdb==4.0.11
+GitPython==3.1.42
+glob2==0.7
+google-auth==2.28.2
+google-auth-oauthlib==1.2.0
+gpustat==1.1.1
+gradio==4.21.0
+gradio_client==0.12.0
+graphql-core==3.2.3
+greenlet==3.0.3
+grpcio==1.62.1
+h11==0.14.0
+h2==4.1.0
+hjson==3.1.0
+hpack==4.0.0
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.21.4
+humanize==4.9.0
+hupper==1.12.1
+Hypercorn==0.16.0
+hyperframe==6.0.1
+idna==3.6
+idscheck==2.3.0
+imageio==2.27.0
+imageio-ffmpeg==0.4.9
+importlib_metadata==7.0.2
+importlib_resources==6.3.0
+inflate64==1.0.0
+iopath==0.1.10
+Jinja2==3.1.2
+jmespath==0.10.0
+joblib==1.3.2
+jsonlines==4.0.0
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kaleido==0.2.1
+kiwisolver==1.4.5
+lazy_loader==0.3
+Markdown==3.6
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+matplotlib==3.8.3
+mdurl==0.1.2
+mmcv-full==1.7.2
+model-index==0.1.11
+mongoengine==0.24.2
+motor==3.3.2
+moviepy==1.0.3
+mpmath==1.3.0
+multivolumefile==0.2.3
+networkx==3.2.1
+ninja==1.11.1.1
+numpy==1.23.5
+nvidia-ml-py==12.535.133
+nvidia-ml-py3==7.352.0
+oauthlib==3.2.2
+omegaconf==2.3.0
+openai==1.14.0
+opencv-python==4.9.0.80
+opencv-python-headless==4.9.0.80
+opendatalab==0.0.10
+openmim==0.3.9
+openxlab==0.0.36
+ordered-set==4.1.0
+orjson==3.9.15
+oss2==2.17.0
+packaging==24.0
+pandas==1.5.3
+PasteDeploy==3.1.0
+pathtools==0.1.2
+pbkdf2==1.3
+peft==0.10.0
+pillow==10.2.0
+plaster==1.1.2
+plaster-pastedeploy==1.0.1
+platformdirs==4.2.0
+plotly==5.20.0
+portalocker==2.8.2
+pprintpp==0.4.0
+priority==2.0.0
+proglog==0.1.10
+protobuf==4.23.4
+psutil==5.9.4
+py-cpuinfo==9.0.0
+py7zr==0.21.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+pybcj==1.0.2
+pycparser==2.21
+pycryptodome==3.20.0
+pycryptodomex==3.20.0
+pydantic==2.6.4
+pydantic_core==2.16.3
+pydub==0.25.1
+Pygments==2.17.2
+pymongo==4.6.2
+pynvml==11.5.0
+pyparsing==3.1.2
+pyppmd==1.1.0
+pyramid==2.0.2
+pyramid-mailer==0.15.1
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+python3-openid==3.2.0
+pytz==2023.4
+PyYAML==6.0
+pyzstd==0.15.9
+rarfile==4.1
+referencing==0.33.0
+regex==2023.12.25
+repoze.sendmail==4.4.1
+requests==2.28.2
+requests-oauthlib==1.4.0
+retrying==1.3.4
+rich==13.4.2
+rpds-py==0.18.0
+rsa==4.9
+ruff==0.3.2
+s3transfer==0.10.1
+safetensors==0.4.2
+scikit-image==0.22.0
+scikit-learn==1.4.1.post1
+scipy==1.10.1
+semantic-version==2.10.0
+sentencepiece==0.2.0
+sentry-sdk==1.42.0
+setproctitle==1.3.3
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.5
+SQLAlchemy==2.0.28
+sse-starlette==0.10.3
+sseclient-py==1.8.0
+starlette==0.36.3
+strawberry-graphql==0.138.1
+sympy==1.12
+tabulate==0.9.0
+taskgroup==0.0.0a4
+tenacity==8.2.3
+tensorboard==2.15.1
+tensorboard-data-server==0.7.2
+tensorboardX==2.6.2.2
+termcolor==2.3.0
+texttable==1.7.0
+threadpoolctl==3.3.0
+tifffile==2024.2.12
+timm==0.6.12
+tokenizers==0.15.2
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.2
+torchaudio
+torchvision==0.17.2
+tqdm==4.65.2
+transaction==4.0
+transformers==4.37.1
+translationstring==1.4
+triton==2.2.0
+typer==0.9.0
+typing_extensions==4.8.0
+tzdata==2024.1
+tzlocal==5.2
+universal-analytics-python3==1.1.1
+urllib3==1.26.18
+uvicorn==0.28.0
+velruse==1.1.1
+venusian==3.1.0
+voxel51-eta==0.12.6
+wandb==0.14.0
+wcwidth==0.2.13
+WebOb==1.8.7
+websockets==11.0.3
+Werkzeug==3.0.1
+wrapt==1.16.0
+wsproto==1.2.0
+WTForms==3.1.2
+wtforms-recaptcha==0.3.2
+xmltodict==0.13.0
+yacs==0.1.8
+yapf==0.40.2
+zipp==3.18.1
+zope.deprecation==5.0
+zope.interface==6.2
+zope.sqlalchemy==3.1
diff --git a/requirements/requirements-vae.txt b/requirements/requirements-vae.txt
new file mode 100644
index 0000000..75530e4
--- /dev/null
+++ b/requirements/requirements-vae.txt
@@ -0,0 +1,5 @@
+beartype==0.18.5
+einops==0.8.0
+einops-exts==0.0.4
+opencv-python==4.9.0.80
+pillow==10.3.0
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
new file mode 100644
index 0000000..20a3fa0
--- /dev/null
+++ b/requirements/requirements.txt
@@ -0,0 +1,28 @@
+colossalai>=0.4.1
+mmengine>=0.10.3
+pandas>=2.0.3
+timm==0.9.16
+rotary_embedding_torch==0.5.3
+ftfy>=6.2.0 # for t5
+diffusers==0.29.0 # for vae
+accelerate==0.29.2 # for t5
+av>=12.0.0 # for video loading
+numpy<2.0.0
+
+# [gradio]
+gradio>=4.26.0
+spaces>=0.28.3
+
+# [notebook]
+ipykernel>=6.29.4
+ipywidgets>=8.1.2
+
+# [training]
+wandb>=0.17.0
+tensorboard>=2.14.0
+pandarallel>=1.6.5
+pyarrow>=16.1.0 # for parquet
+
+# [dev]
+pre-commit>=3.5.0
+openai
diff --git a/scripts/cnv/meta.py b/scripts/cnv/meta.py
new file mode 100644
index 0000000..e9d9a5f
--- /dev/null
+++ b/scripts/cnv/meta.py
@@ -0,0 +1,70 @@
+import argparse
+
+import numpy as np
+import pandas as pd
+from pandarallel import pandarallel
+from torchvision.io.video import read_video
+from tqdm import tqdm
+
+
+def set_parallel(num_workers: int = None) -> callable:
+    if num_workers == 0:
+        return lambda x, *args, **kwargs: x.progress_apply(*args, **kwargs)
+    else:
+        if num_workers is not None:
+            pandarallel.initialize(progress_bar=True, nb_workers=num_workers)
+        else:
+            pandarallel.initialize(progress_bar=True)
+        return lambda x, *args, **kwargs: x.parallel_apply(*args, **kwargs)
+
+
+def get_video_info(path: str) -> pd.Series:
+    vframes, _, vinfo = read_video(path, pts_unit="sec", output_format="TCHW")
+    num_frames, C, height, width = vframes.shape
+    fps = round(vinfo["video_fps"], 3)
+    aspect_ratio = height / width if width > 0 else np.nan
+    resolution = height * width
+
+    ret = pd.Series(
+        [height, width, fps, num_frames, aspect_ratio, resolution],
+        index=[
+            "height",
+            "width",
+            "fps",
+            "num_frames",
+            "aspect_ratio",
+            "resolution",
+        ],
+        dtype=object,
+    )
+    return ret
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", type=str, required=True, help="Input file path")
+    parser.add_argument("--output", type=str, required=True, help="Output file path")
+    parser.add_argument(
+        "--num_workers", type=int, default=None, help="Number of workers"
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    input_path = args.input
+    output_path = args.output
+    num_workers = args.num_workers
+
+    df = pd.read_csv(input_path)
+    tqdm.pandas()
+    apply = set_parallel(num_workers)
+
+    result = apply(df["path"], get_video_info)
+    for col in result.columns:
+        df[col] = result[col]
+    df.to_csv(output_path, index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/cnv/shard.py b/scripts/cnv/shard.py
new file mode 100644
index 0000000..1162cf9
--- /dev/null
+++ b/scripts/cnv/shard.py
@@ -0,0 +1,74 @@
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+try:
+    import dask.dataframe as dd
+
+    SUPPORT_DASK = True
+except:
+    SUPPORT_DASK = False
+
+
+def shard_parquet(input_path, k):
+    # 检查输入路径是否存在
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"Input file {input_path} does not exist.")
+
+    # 读取 Parquet 文件为 Pandas DataFrame
+    if SUPPORT_DASK:
+        df = dd.read_parquet(input_path).compute()
+    else:
+        df = pd.read_parquet(input_path)
+
+    # 去除指定的列
+    columns_to_remove = [
+        "num_frames",
+        "height",
+        "width",
+        "aspect_ratio",
+        "fps",
+        "resolution",
+    ]
+    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])
+
+    # 计算每个分片的大小
+    total_rows = len(df)
+    rows_per_shard = (total_rows + k - 1) // k  # 向上取整
+
+    # 创建与原始文件同名的文件夹
+    base_dir = os.path.dirname(input_path)
+    base_name = os.path.splitext(os.path.basename(input_path))[0]
+    output_dir = os.path.join(base_dir, base_name)
+    os.makedirs(output_dir, exist_ok=True)
+
+    # 创建分片并保存到文件夹
+    for i in tqdm(range(k)):
+        start_idx = i * rows_per_shard
+        end_idx = min(start_idx + rows_per_shard, total_rows)
+
+        shard_df = df.iloc[start_idx:end_idx]
+        if shard_df.empty:
+            continue
+
+        shard_file_name = f"{i + 1:05d}.parquet"
+        shard_path = os.path.join(output_dir, shard_file_name)
+
+        shard_df.to_parquet(shard_path, index=False)
+
+        # print(f"Shard saved to {shard_path}, rows: {len(shard_df)}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Shard a Parquet file.")
+    parser.add_argument("input_path", type=str, help="Path to the input Parquet file.")
+    parser.add_argument(
+        "k", type=int, help="Number of shards to create.", default=10000
+    )
+
+    args = parser.parse_args()
+
+    shard_parquet(args.input_path, args.k)
diff --git a/scripts/diffusion/inference.py b/scripts/diffusion/inference.py
new file mode 100644
index 0000000..221ed2d
--- /dev/null
+++ b/scripts/diffusion/inference.py
@@ -0,0 +1,245 @@
+import os
+import time
+import warnings
+from pprint import pformat
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+import torch
+import torch.distributed as dist
+from colossalai.utils import set_seed
+from tqdm import tqdm
+
+from opensora.acceleration.parallel_states import get_data_parallel_group
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.registry import DATASETS, build_module
+from opensora.utils.cai import (
+    get_booster,
+    get_is_saving_process,
+    init_inference_environment,
+)
+from opensora.utils.config import parse_alias, parse_configs
+from opensora.utils.inference import (
+    add_fps_info_to_text,
+    add_motion_score_to_text,
+    create_tmp_csv,
+    modify_option_to_t2i,
+    process_and_save,
+)
+from opensora.utils.logger import create_logger, is_main_process
+from opensora.utils.misc import log_cuda_max_memory, to_torch_dtype
+from opensora.utils.prompt_refine import refine_prompts
+from opensora.utils.sampling import (
+    SamplingOption,
+    prepare_api,
+    prepare_models,
+    sanitize_sampling_option,
+)
+
+
+@torch.inference_mode()
+def main():  # inference entry point: parse config, build dataset and models, then generate and save samples
+    # ======================================================
+    # 1. configs & runtime variables
+    # ======================================================
+    torch.set_grad_enabled(False)  # NOTE(review): looks redundant under @torch.inference_mode() — confirm
+
+    # == parse configs ==
+    cfg = parse_configs()
+    cfg = parse_alias(cfg)
+
+    # == device and dtype ==
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
+    seed = cfg.get("seed", 1024)
+    if seed is not None:
+        set_seed(seed)
+
+    # == init distributed env ==
+    init_inference_environment()
+    logger = create_logger()
+    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
+    is_saving_process = get_is_saving_process(cfg)
+    booster = get_booster(cfg)
+    booster_ae = get_booster(cfg, ae=True)
+
+    # ======================================================
+    # 2. build dataset and dataloader
+    # ======================================================
+    logger.info("Building dataset...")
+
+    # save directory
+    save_dir = cfg.save_dir
+    os.makedirs(save_dir, exist_ok=True)
+
+    # == build dataset ==
+    if cfg.get("prompt"):  # prompts given in the config replace the dataset's data_path with a temporary csv
+        cfg.dataset.data_path = create_tmp_csv(save_dir, cfg.prompt, cfg.get("ref", None), create=is_main_process())
+    dist.barrier()  # presumably lets every rank wait until the main process has written the csv
+    dataset = build_module(cfg.dataset, DATASETS)
+
+    # range selection
+    start_index = cfg.get("start_index", 0)
+    end_index = cfg.get("end_index", None)
+    if end_index is None:  # default slice end lies past the last row, i.e. keep everything from start_index
+        end_index = start_index + cfg.get("num_samples", len(dataset.data) + 1)
+    dataset.data = dataset.data[start_index:end_index]
+    logger.info("Dataset contains %s samples.", len(dataset))
+
+    # == build dataloader ==
+    dataloader_args = dict(
+        dataset=dataset,
+        batch_size=cfg.get("batch_size", 1),
+        num_workers=cfg.get("num_workers", 4),
+        seed=cfg.get("seed", 1024),
+        shuffle=False,
+        drop_last=False,
+        pin_memory=True,
+        process_group=get_data_parallel_group(),
+        prefetch_factor=cfg.get("prefetch_factor", None),
+    )
+    dataloader, _ = prepare_dataloader(**dataloader_args)
+
+    # == prepare default params ==
+    sampling_option = SamplingOption(**cfg.sampling_option)
+    sampling_option = sanitize_sampling_option(sampling_option)
+
+    cond_type = cfg.get("cond_type", "t2v")
+    prompt_refine = cfg.get("prompt_refine", False)
+    fps_save = cfg.get("fps_save", 16)
+    num_sample = cfg.get("num_sample", 1)  # passes over the dataset; a different key from "num_samples" above
+
+    type_name = "image" if cfg.sampling_option.num_frames == 1 else "video"
+    sub_dir = f"{type_name}_{cfg.sampling_option.resolution}"
+    os.makedirs(os.path.join(save_dir, sub_dir), exist_ok=True)
+    use_t2i2v = cfg.get("use_t2i2v", False)  # two-stage mode: generate an image first, then condition the video on it
+    img_sub_dir = os.path.join(sub_dir, "generated_condition")
+    if use_t2i2v:
+        os.makedirs(os.path.join(save_dir, sub_dir, "generated_condition"), exist_ok=True)
+
+    # ======================================================
+    # 3. build model
+    # ======================================================
+    logger.info("Building models...")
+
+    # == build flux model ==
+    model, model_ae, model_t5, model_clip, optional_models = prepare_models(
+        cfg, device, dtype, offload_model=cfg.get("offload_model", False)
+    )
+    log_cuda_max_memory("build model")
+
+    if booster:  # boost, then keep working with the unwrapped module
+        model, _, _, _, _ = booster.boost(model=model)
+        model = model.unwrap()
+    if booster_ae:
+        model_ae, _, _, _, _ = booster_ae.boost(model=model_ae)
+        model_ae = model_ae.unwrap()
+
+    api_fn = prepare_api(model, model_ae, model_t5, model_clip, optional_models)
+
+    # prepare image flux model if t2i2v
+    if use_t2i2v:
+        api_fn_img = prepare_api(
+            optional_models["img_flux"], optional_models["img_flux_ae"], model_t5, model_clip, optional_models
+        )
+
+    # ======================================================
+    # 4. inference
+    # ======================================================
+    for epoch in range(num_sample):  # generate multiple samples with different seeds
+        dataloader_iter = iter(dataloader)
+        with tqdm(
+            enumerate(dataloader_iter, start=0),
+            desc="Inference progress",
+            disable=not is_main_process(),
+            initial=0,
+            total=len(dataloader),
+        ) as pbar:
+            for _, batch in pbar:
+                original_text = batch.pop("text")  # kept aside; the text is re-derived separately for each stage
+                if use_t2i2v:
+                    batch["text"] = original_text if not prompt_refine else refine_prompts(original_text, type="t2i")
+                    sampling_option_t2i = modify_option_to_t2i(
+                        sampling_option,
+                        distilled=True,
+                        img_resolution=cfg.get("img_resolution", "768px"),
+                    )
+                    if cfg.get("offload_model", False):  # swap: video models to cpu, image models to the device
+                        model_move_start = time.time()
+                        model = model.to("cpu", dtype)
+                        model_ae = model_ae.to("cpu", dtype)
+                        optional_models["img_flux"].to(device, dtype)
+                        optional_models["img_flux_ae"].to(device, dtype)
+                        logger.info(
+                            "offload video diffusion model to cpu, load image flux model to gpu: %s s",
+                            time.time() - model_move_start,
+                        )
+
+                    logger.info("Generating image condition by flux...")
+                    x_cond = api_fn_img(
+                        sampling_option_t2i,
+                        "t2v",  # the image stage is always run without a visual condition
+                        seed=sampling_option.seed + epoch if sampling_option.seed else None,  # NOTE(review): seed 0 is falsy -> None
+                        channel=cfg["img_flux"]["in_channels"],
+                        **batch,
+                    ).cpu()
+
+                    # save image to disk
+                    batch["name"] = process_and_save(
+                        x_cond,
+                        batch,
+                        cfg,
+                        img_sub_dir,
+                        sampling_option_t2i,
+                        epoch,
+                        start_index,
+                        saving=is_saving_process,
+                    )
+                    dist.barrier()  # presumably ensures the images are on disk before they are referenced below
+
+                    if cfg.get("offload_model", False):  # swap back: video models to the device, image models to cpu
+                        model_move_start = time.time()
+                        model = model.to(device, dtype)
+                        model_ae = model_ae.to(device, dtype)
+                        optional_models["img_flux"].to("cpu", dtype)
+                        optional_models["img_flux_ae"].to("cpu", dtype)
+                        logger.info(
+                            "load video diffusion model to gpu, offload image flux model to cpu: %s s",
+                            time.time() - model_move_start,
+                        )
+
+                    ref_dir = os.path.join(save_dir, os.path.join(sub_dir, "generated_condition"))
+                    batch["ref"] = [os.path.join(ref_dir, f"{x}.png") for x in batch["name"]]
+                    cond_type = "i2v_head"  # the generated image becomes the condition; replaces the configured cond_type
+
+                batch["text"] = original_text
+                if prompt_refine:
+                    batch["text"] = refine_prompts(
+                        original_text, type="t2v" if cond_type == "t2v" else "t2i", image_paths=batch.get("ref", None)
+                    )
+                batch["text"] = add_fps_info_to_text(batch.pop("text"), fps=fps_save)
+                if "motion_score" in cfg:
+                    batch["text"] = add_motion_score_to_text(batch.pop("text"), cfg.get("motion_score", 5))
+
+                logger.info("Generating video...")
+                x = api_fn(
+                    sampling_option,
+                    cond_type,
+                    seed=sampling_option.seed + epoch if sampling_option.seed else None,  # NOTE(review): seed 0 is falsy -> None
+                    patch_size=cfg.get("patch_size", 2),
+                    save_prefix=cfg.get("save_prefix", ""),
+                    channel=cfg["model"]["in_channels"],
+                    **batch,
+                ).cpu()
+
+                if is_saving_process:  # only the saving process writes results
+                    process_and_save(x, batch, cfg, sub_dir, sampling_option, epoch, start_index)
+                dist.barrier()  # keep all ranks in step before the next batch
+
+    logger.info("Inference finished.")
+    log_cuda_max_memory("inference")
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/diffusion/train.py b/scripts/diffusion/train.py
new file mode 100644
index 0000000..8f8a09c
--- /dev/null
+++ b/scripts/diffusion/train.py
@@ -0,0 +1,654 @@
+import gc
+import math
+import os
+import subprocess
+import warnings
+from contextlib import nullcontext
+from copy import deepcopy
+from pprint import pformat
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+gc.disable()
+
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+import wandb
+from colossalai.booster import Booster
+from colossalai.utils import set_seed
+from peft import LoraConfig
+from tqdm import tqdm
+
+from opensora.acceleration.checkpoint import (
+    GLOBAL_ACTIVATION_MANAGER,
+    set_grad_checkpoint,
+)
+from opensora.acceleration.parallel_states import get_data_parallel_group
+from opensora.datasets.aspect import bucket_to_shapes
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.datasets.pin_memory_cache import PinMemoryCache
+from opensora.models.mmdit.distributed import MMDiTPolicy
+from opensora.registry import DATASETS, MODELS, build_module
+from opensora.utils.ckpt import (
+    CheckpointIO,
+    model_sharding,
+    record_model_param_shape,
+    rm_checkpoints,
+)
+from opensora.utils.config import (
+    config_to_name,
+    create_experiment_workspace,
+    parse_configs,
+)
+from opensora.utils.logger import create_logger
+from opensora.utils.misc import (
+    NsysProfiler,
+    Timers,
+    all_reduce_mean,
+    create_tensorboard_writer,
+    is_log_process,
+    is_pipeline_enabled,
+    log_cuda_max_memory,
+    log_cuda_memory,
+    log_model_params,
+    print_mem,
+    to_torch_dtype,
+)
+from opensora.utils.optimizer import create_lr_scheduler, create_optimizer
+from opensora.utils.sampling import (
+    get_res_lin_function,
+    pack,
+    prepare,
+    prepare_ids,
+    time_shift,
+)
+from opensora.utils.train import (
+    create_colossalai_plugin,
+    dropout_condition,
+    get_batch_loss,
+    prepare_visual_condition_causal,
+    prepare_visual_condition_uncausal,
+    set_eps,
+    set_lr,
+    setup_device,
+    update_ema,
+    warmup_ae,
+)
+
+torch.backends.cudnn.benchmark = False  # keep benchmark mode off: True leads to a slowdown in conv3d
+
+
+def main():
+    # ======================================================
+    # 1. configs & runtime variables
+    # ======================================================
+    # == parse configs ==
+    cfg = parse_configs()
+
+    # == get dtype & device ==
+    dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
+    device, coordinator = setup_device()
+    grad_ckpt_buffer_size = cfg.get("grad_ckpt_buffer_size", 0)
+    if grad_ckpt_buffer_size > 0:
+        GLOBAL_ACTIVATION_MANAGER.setup_buffer(grad_ckpt_buffer_size, dtype)
+    checkpoint_io = CheckpointIO()
+    set_seed(cfg.get("seed", 1024))
+    PinMemoryCache.force_dtype = dtype
+    pin_memory_cache_pre_alloc_numels = cfg.get("pin_memory_cache_pre_alloc_numels", None)
+    PinMemoryCache.pre_alloc_numels = pin_memory_cache_pre_alloc_numels
+
+    # == init ColossalAI booster ==
+    plugin_type = cfg.get("plugin", "zero2")
+    plugin_config = cfg.get("plugin_config", {})
+    plugin_kwargs = {}
+    if plugin_type == "hybrid":
+        plugin_kwargs["custom_policy"] = MMDiTPolicy
+    plugin = create_colossalai_plugin(
+        plugin=plugin_type,
+        dtype=cfg.get("dtype", "bf16"),
+        grad_clip=cfg.get("grad_clip", 0),
+        **plugin_config,
+        **plugin_kwargs,
+    )
+    booster = Booster(plugin=plugin)
+
+    seq_align = plugin_config.get("sp_size", 1)
+
+    # == init exp_dir ==
+    exp_name, exp_dir = create_experiment_workspace(
+        cfg.get("outputs", "./outputs"),
+        model_name=config_to_name(cfg),
+        config=cfg.to_dict(),
+        exp_name=cfg.get("exp_name", None),  # useful for automatic restart to specify the exp_name
+    )
+
+    if is_log_process(plugin_type, plugin_config):
+        print(f"changing {exp_dir} to share")
+        os.system(f"chgrp -R share {exp_dir}")
+
+    # == init logger, tensorboard & wandb ==
+    logger = create_logger(exp_dir)
+    logger.info("Training configuration:\n %s", pformat(cfg.to_dict()))
+    tb_writer = None
+    if coordinator.is_master():
+        tb_writer = create_tensorboard_writer(exp_dir)
+        if cfg.get("wandb", False):
+            wandb.init(
+                project=cfg.get("wandb_project", "Open-Sora"),
+                name=exp_name,
+                config=cfg.to_dict(),
+                dir=exp_dir,
+            )
+    num_gpus = dist.get_world_size() if dist.is_initialized() else 1
+    tp_size = cfg["plugin_config"].get("tp_size", 1)
+    sp_size = cfg["plugin_config"].get("sp_size", 1)
+    pp_size = cfg["plugin_config"].get("pp_size", 1)
+    num_groups = num_gpus // (tp_size * sp_size * pp_size)
+    logger.info("Number of GPUs: %s", num_gpus)
+    logger.info("Number of groups: %s", num_groups)
+
+    # ======================================================
+    # 2. build dataset and dataloader
+    # ======================================================
+    logger.info("Building dataset...")
+    # == build dataset ==
+    dataset = build_module(cfg.dataset, DATASETS)
+    logger.info("Dataset contains %s samples.", len(dataset))
+
+    # == build dataloader ==
+    cache_pin_memory = pin_memory_cache_pre_alloc_numels is not None
+    dataloader_args = dict(
+        dataset=dataset,
+        batch_size=cfg.get("batch_size", None),
+        num_workers=cfg.get("num_workers", 4),
+        seed=cfg.get("seed", 1024),
+        shuffle=True,
+        drop_last=True,
+        pin_memory=True,
+        process_group=get_data_parallel_group(),
+        prefetch_factor=cfg.get("prefetch_factor", None),
+        cache_pin_memory=cache_pin_memory,
+        num_groups=num_groups,
+    )
+    print_mem("before prepare_dataloader")
+    dataloader, sampler = prepare_dataloader(
+        bucket_config=cfg.get("bucket_config", None),
+        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
+        **dataloader_args,
+    )
+    print_mem("after prepare_dataloader")
+    num_steps_per_epoch = len(dataloader)
+    dataset.to_efficient()
+
+    # ======================================================
+    # 3. build model
+    # ======================================================
+    logger.info("Building models...")
+
+    # == build model model ==
+    model = build_module(cfg.model, MODELS, device_map=device, torch_dtype=dtype).train()
+    if cfg.get("grad_checkpoint", True):
+        set_grad_checkpoint(model)
+    log_cuda_memory("diffusion")
+    log_model_params(model)
+
+    # == build EMA model ==
+    use_lora = cfg.get("lora_config", None) is not None
+    if cfg.get("ema_decay", None) is not None and not use_lora:
+        ema = deepcopy(model).cpu().eval().requires_grad_(False)
+        ema_shape_dict = record_model_param_shape(ema)
+        logger.info("EMA model created.")
+    else:
+        ema = ema_shape_dict = None
+        logger.info("No EMA model created.")
+    log_cuda_memory("EMA")
+
+    # == enable LoRA ==
+    if use_lora:
+        lora_config = LoraConfig(**cfg.get("lora_config", None))
+        model = booster.enable_lora(
+            model=model,
+            lora_config=lora_config,
+            pretrained_dir=cfg.get("lora_checkpoint", None),
+        )
+        log_cuda_memory("lora")
+        log_model_params(model)
+
+    if not cfg.get("cached_video", False):
+        # == buildn autoencoder ==
+        model_ae = build_module(cfg.ae, MODELS, device_map=device, torch_dtype=dtype).eval().requires_grad_(False)
+        del model_ae.decoder
+        log_cuda_memory("autoencoder")
+        log_model_params(model_ae)
+        model_ae.encode = torch.compile(model_ae.encoder, dynamic=True)
+
+    if not cfg.get("cached_text", False):
+        # == build text encoder (t5) ==
+        model_t5 = build_module(cfg.t5, MODELS, device_map=device, torch_dtype=dtype).eval().requires_grad_(False)
+        log_cuda_memory("t5")
+        log_model_params(model_t5)
+
+        # == build text encoder (clip) ==
+        model_clip = build_module(cfg.clip, MODELS, device_map=device, torch_dtype=dtype).eval().requires_grad_(False)
+        log_cuda_memory("clip")
+        log_model_params(model_clip)
+
+    # == setup optimizer ==
+    optimizer = create_optimizer(model, cfg.optim)
+
+    # == setup lr scheduler ==
+    lr_scheduler = create_lr_scheduler(
+        optimizer=optimizer,
+        num_steps_per_epoch=num_steps_per_epoch,
+        epochs=cfg.get("epochs", 1000),
+        warmup_steps=cfg.get("warmup_steps", None),
+        use_cosine_scheduler=cfg.get("use_cosine_scheduler", False),
+    )
+    log_cuda_memory("optimizer")
+
+    # == prepare null vectors for dropout ==
+    if cfg.get("cached_text", False):
+        null_txt = torch.load("/mnt/ddn/sora/tmp_load/null_t5.pt", map_location=device)
+        null_vec = torch.load("/mnt/ddn/sora/tmp_load/null_clip.pt", map_location=device)
+    else:
+        null_txt = model_t5("")
+        null_vec = model_clip("")
+
+    # =======================================================
+    # 4. distributed training preparation with colossalai
+    # =======================================================
+    logger.info("Preparing for distributed training...")
+    # == boosting ==
+    torch.set_default_dtype(dtype)
+    model, optimizer, _, dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optimizer,
+        lr_scheduler=lr_scheduler,
+        dataloader=dataloader,
+    )
+    torch.set_default_dtype(torch.float)
+    logger.info("Boosted model for distributed training")
+    log_cuda_memory("boost")
+
+    # == global variables ==
+    cfg_epochs = cfg.get("epochs", 1000)
+    log_step = acc_step = 0
+    running_loss = 0.0
+    timers = Timers(record_time=cfg.get("record_time", False), record_barrier=cfg.get("record_barrier", False))
+    nsys = NsysProfiler(
+        warmup_steps=cfg.get("nsys_warmup_steps", 2),
+        num_steps=cfg.get("nsys_num_steps", 2),
+        enabled=cfg.get("nsys", False),
+    )
+    logger.info("Training for %s epochs with %s steps per epoch", cfg_epochs, num_steps_per_epoch)
+
+    # == resume ==
+    load_master_weights = cfg.get("load_master_weights", False)
+    save_master_weights = cfg.get("save_master_weights", False)
+    start_epoch = cfg.get("start_epoch", None)
+    start_step = cfg.get("start_step", None)
+    if cfg.get("load", None) is not None:
+        logger.info("Loading checkpoint from %s", cfg.load)
+
+        lr_scheduler_to_load = lr_scheduler
+        if cfg.get("update_warmup_steps", False):
+            lr_scheduler_to_load = None
+        ret = checkpoint_io.load(
+            booster,
+            cfg.load,
+            model=model,
+            ema=ema,
+            optimizer=optimizer,
+            lr_scheduler=lr_scheduler_to_load,
+            sampler=(
+                None if start_step is not None else sampler
+            ),  # if specify start step, set last_micro_batch_access_index of a new sampler instead
+            include_master_weights=load_master_weights,
+        )
+        start_epoch = start_epoch if start_epoch is not None else ret[0]
+        start_step = start_step if start_step is not None else ret[1]
+        logger.info("Loaded checkpoint %s at epoch %s step %s", cfg.load, ret[0], ret[1])
+
+        # load optimizer and scheduler will overwrite some of the hyperparameters, so we need to reset them
+        set_lr(optimizer, lr_scheduler, cfg.optim.lr, cfg.get("initial_lr", None))
+        set_eps(optimizer, cfg.optim.eps)
+
+        if cfg.get("update_warmup_steps", False):
+            assert (
+                cfg.get("warmup_steps", None) is not None
+            ), "you need to set warmup_steps in order to pass --update-warmup-steps True"
+            # set_warmup_steps(lr_scheduler, cfg.warmup_steps)
+            lr_scheduler.step(start_epoch * num_steps_per_epoch + start_step)
+            logger.info("The learning rate starts from %s", optimizer.param_groups[0]["lr"])
+    if start_step is not None:
+        # if start step exceeds data length, go to next epoch
+        if start_step > num_steps_per_epoch:
+            start_epoch = (
+                start_epoch + start_step // num_steps_per_epoch
+                if start_epoch is not None
+                else start_step // num_steps_per_epoch
+            )
+            start_step = start_step % num_steps_per_epoch
+    else:
+        start_step = 0
+    sampler.set_step(start_step)
+    start_epoch = start_epoch if start_epoch is not None else 0
+    logger.info("Starting from epoch %s step %s", start_epoch, start_step)
+
+    # == sharding EMA model ==
+    if ema is not None:
+        model_sharding(ema)
+        ema = ema.to(device)
+        log_cuda_memory("sharding EMA")
+
+    # == warmup autoencoder ==
+    if cfg.get("warmup_ae", False):
+        shapes = bucket_to_shapes(cfg.get("bucket_config", None), batch_size=cfg.ae.batch_size)
+        warmup_ae(model_ae, shapes, device, dtype)
+
+    # =======================================================
+    # 5. training iter
+    # =======================================================
+    sigma_min = cfg.get("sigma_min", 1e-5)
+    accumulation_steps = cfg.get("accumulation_steps", 1)
+    ckpt_every = cfg.get("ckpt_every", 0)
+
+    if cfg.get("is_causal_vae", False):
+        prepare_visual_condition = prepare_visual_condition_causal
+    else:
+        prepare_visual_condition = prepare_visual_condition_uncausal
+
+    @torch.no_grad()
+    def prepare_inputs(batch):
+        inp = dict()
+        x = batch.pop("video")
+        y = batch.pop("text")
+        bs = x.shape[0]
+
+        # == encode video ==
+        with nsys.range("encode_video"), timers["encode_video"]:
+            # == prepare condition ==
+            if cfg.get("condition_config", None) is not None:
+                # condition for i2v & v2v
+                x_0, cond = prepare_visual_condition(x, cfg.condition_config, model_ae)
+                cond = pack(cond, patch_size=cfg.get("patch_size", 2))
+                inp["cond"] = cond
+            else:
+                if cfg.get("cached_video", False):
+                    x_0 = batch.pop("video_latents").to(device=device, dtype=dtype)
+                else:
+                    x_0 = model_ae.encode(x)
+
+        # == prepare timestep ==
+        # follow SD3 time shift, shift_alpha = 1 for 256px and shift_alpha = 3 for 1024px
+        shift_alpha = get_res_lin_function()((x_0.shape[-1] * x_0.shape[-2]) // 4)
+        # add temporal influence
+        shift_alpha *= math.sqrt(x_0.shape[-3])  # for image, T=1 so no effect
+        t = torch.sigmoid(torch.randn((bs), device=device))
+        t = time_shift(shift_alpha, t).to(dtype)
+
+        if cfg.get("cached_text", False):
+            # == encode text ==
+            t5_embedding = batch.pop("text_t5").to(device=device, dtype=dtype)
+            clip_embedding = batch.pop("text_clip").to(device=device, dtype=dtype)
+            with nsys.range("encode_text"), timers["encode_text"]:
+                inp_ = prepare_ids(x_0, t5_embedding, clip_embedding)
+                inp.update(inp_)
+                x_0 = pack(x_0, patch_size=cfg.get("patch_size", 2))
+        else:
+            # == encode text ==
+            with nsys.range("encode_text"), timers["encode_text"]:
+                inp_ = prepare(
+                    model_t5,
+                    model_clip,
+                    x_0,
+                    prompt=y,
+                    seq_align=seq_align,
+                    patch_size=cfg.get("patch_size", 2),
+                )
+                inp.update(inp_)
+                x_0 = pack(x_0, patch_size=cfg.get("patch_size", 2))
+
+        # == dropout ==
+        if cfg.get("dropout_ratio", None) is not None:
+            cur_null_txt = null_txt
+            num_pad_null_txt = inp["txt"].shape[1] - cur_null_txt.shape[1]
+            if num_pad_null_txt > 0:
+                cur_null_txt = torch.cat([cur_null_txt] + [cur_null_txt[:, -1:]] * num_pad_null_txt, dim=1)
+            inp["txt"] = dropout_condition(
+                cfg.dropout_ratio.get("t5", 0.0),
+                inp["txt"],
+                cur_null_txt,
+            )
+            inp["y_vec"] = dropout_condition(
+                cfg.dropout_ratio.get("clip", 0.0),
+                inp["y_vec"],
+                null_vec,
+            )
+
+        # == prepare noise vector ==
+        x_1 = torch.randn_like(x_0, dtype=torch.float32).to(device, dtype)
+        t_rev = 1 - t
+        x_t = t_rev[:, None, None] * x_0 + (1 - (1 - sigma_min) * t_rev[:, None, None]) * x_1
+        inp["img"] = x_t
+        inp["timesteps"] = t.to(dtype)
+        inp["guidance"] = torch.full((x_t.shape[0],), cfg.get("guidance", 4), device=x_t.device, dtype=x_t.dtype)
+
+        return inp, x_0, x_1
+
+    def run_iter(inp, x_0, x_1):
+        if is_pipeline_enabled(plugin_type, plugin_config):
+            inp["target"] = (1 - sigma_min) * x_1 - x_0  # follow MovieGen, modify V_t accordingly
+            with nsys.range("forward-backward"), timers["forward-backward"]:
+                data_iter = iter([inp])
+                if cfg.get("no_i2v_ref_loss", False):
+                    loss_fn = (
+                        lambda out, input_: get_batch_loss(out, input_["target"], input_.pop("masks", None))
+                        / accumulation_steps
+                    )
+                else:
+                    loss_fn = (
+                        lambda out, input_: F.mse_loss(out.float(), input_["target"].float(), reduction="mean")
+                        / accumulation_steps
+                    )
+                loss = booster.execute_pipeline(data_iter, model, loss_fn, optimizer)["loss"]
+                loss = loss * accumulation_steps if loss is not None else loss
+                loss_item = all_reduce_mean(loss.data.clone().detach())
+        else:
+            with nsys.range("forward"), timers["forward"]:
+                model_pred = model(**inp)  # B, T, L
+                v_t = (1 - sigma_min) * x_1 - x_0
+                if cfg.get("no_i2v_ref_loss", False):
+                    loss = get_batch_loss(model_pred, v_t, inp.pop("masks", None))
+                else:
+                    loss = F.mse_loss(model_pred.float(), v_t.float(), reduction="mean")
+
+            loss_item = all_reduce_mean(loss.data.clone().detach()).item()
+
+            # == backward & update ==
+            dist.barrier()
+            with nsys.range("backward"), timers["backward"]:
+                ctx = (
+                    booster.no_sync(model, optimizer)
+                    if cfg.get("plugin", "zero2") in ("zero1", "zero1-seq") and (step + 1) % accumulation_steps != 0
+                    else nullcontext()
+                )
+                with ctx:
+                    booster.backward(loss=(loss / accumulation_steps), optimizer=optimizer)
+
+        with nsys.range("optim"), timers["optim"]:
+            if (step + 1) % accumulation_steps == 0:
+                booster.checkpoint_io.synchronize()
+                optimizer.step()
+                optimizer.zero_grad()
+            if lr_scheduler is not None:
+                lr_scheduler.step()
+
+        # == update EMA ==
+        if ema is not None:
+            with nsys.range("update_ema"), timers["update_ema"]:
+                update_ema(
+                    ema,
+                    model.unwrap(),
+                    optimizer=optimizer,
+                    decay=cfg.get("ema_decay", 0.9999),
+                )
+
+        return loss_item
+
+    # =======================================================
+    # 6. training loop
+    # =======================================================
+    dist.barrier()
+    for epoch in range(start_epoch, cfg_epochs):
+        # == set dataloader to new epoch ==
+        sampler.set_epoch(epoch)
+        dataloader_iter = iter(dataloader)
+        logger.info("Beginning epoch %s...", epoch)
+
+        # == training loop in an epoch ==
+        with tqdm(
+            enumerate(dataloader_iter, start=start_step),
+            desc=f"Epoch {epoch}",
+            disable=not is_log_process(plugin_type, plugin_config),
+            initial=start_step,
+            total=num_steps_per_epoch,
+        ) as pbar:
+            pbar_iter = iter(pbar)
+
+            # prefetch one for non-blocking data loading
+            def fetch_data():
+                step, batch = next(pbar_iter)
+                # print(f"==debug== rank{dist.get_rank()} {dataloader_iter.get_cache_info()}")
+                pinned_video = batch["video"]
+                batch["video"] = pinned_video.to(device, dtype, non_blocking=True)
+                return batch, step, pinned_video
+
+            batch_, step_, pinned_video_ = fetch_data()
+
+            for _ in range(start_step, num_steps_per_epoch):
+                nsys.step()
+                # == load data ===
+                with nsys.range("load_data"), timers["load_data"]:
+                    batch, step, pinned_video = batch_, step_, pinned_video_
+
+                    if step + 1 < num_steps_per_epoch:
+                        # only fetch new data if not last step
+                        batch_, step_, pinned_video_ = fetch_data()
+
+                # == run iter ==
+                with nsys.range("iter"), timers["iter"]:
+                    inp, x_0, x_1 = prepare_inputs(batch)
+                    if cache_pin_memory:
+                        dataloader_iter.remove_cache(pinned_video)
+                    loss = run_iter(inp, x_0, x_1)
+
+                # == update log info ==
+                if loss is not None:
+                    running_loss += loss
+
+                # == log config ==
+                global_step = epoch * num_steps_per_epoch + step
+                actual_update_step = (global_step + 1) // accumulation_steps
+                log_step += 1
+                acc_step += 1
+
+                # == logging ==
+                if (global_step + 1) % accumulation_steps == 0:
+                    if actual_update_step % cfg.get("log_every", 1) == 0:
+                        if is_log_process(plugin_type, plugin_config):
+                            avg_loss = running_loss / log_step
+                            # progress bar
+                            pbar.set_postfix(
+                                {
+                                    "loss": avg_loss,
+                                    "global_grad_norm": optimizer.get_grad_norm(),
+                                    "step": step,
+                                    "global_step": global_step,
+                                    # "actual_update_step": actual_update_step,
+                                    "lr": optimizer.param_groups[0]["lr"],
+                                }
+                            )
+                            # tensorboard
+                            if tb_writer is not None:
+                                tb_writer.add_scalar("loss", loss, actual_update_step)
+                            # wandb
+                            if cfg.get("wandb", False):
+                                wandb_dict = {
+                                    "iter": global_step,
+                                    "acc_step": acc_step,
+                                    "epoch": epoch,
+                                    "loss": loss,
+                                    "avg_loss": avg_loss,
+                                    "lr": optimizer.param_groups[0]["lr"],
+                                    "eps": optimizer.param_groups[0]["eps"],
+                                    "global_grad_norm": optimizer.get_grad_norm(),  # test grad norm
+                                }
+                                if cfg.get("record_time", False):
+                                    wandb_dict.update(timers.to_dict())
+                                wandb.log(wandb_dict, step=actual_update_step)
+
+                        running_loss = 0.0
+                        log_step = 0
+
+                # == checkpoint saving ==
+                # uncomment below 3 lines to forcely clean cache
+                with nsys.range("clean_cache"), timers["clean_cache"]:
+                    if ckpt_every > 0 and actual_update_step % ckpt_every == 0 and coordinator.is_master():
+                        subprocess.run("sudo drop_cache", shell=True)
+
+                with nsys.range("checkpoint"), timers["checkpoint"]:
+                    if ckpt_every > 0 and actual_update_step % ckpt_every == 0:
+                        # mannual garbage collection
+                        gc.collect()
+
+                        save_dir = checkpoint_io.save(
+                            booster,
+                            exp_dir,
+                            model=model,
+                            ema=ema,
+                            optimizer=optimizer,
+                            lr_scheduler=lr_scheduler,
+                            sampler=sampler,
+                            epoch=epoch,
+                            step=step + 1,
+                            global_step=global_step + 1,
+                            batch_size=cfg.get("batch_size", None),
+                            lora=use_lora,
+                            actual_update_step=actual_update_step,
+                            ema_shape_dict=ema_shape_dict,
+                            async_io=cfg.get("async_io", False),
+                            include_master_weights=save_master_weights,
+                        )
+
+                        if is_log_process(plugin_type, plugin_config):
+                            os.system(f"chgrp -R share {save_dir}")
+
+                        logger.info(
+                            "Saved checkpoint at epoch %s, step %s, global_step %s to %s",
+                            epoch,
+                            step + 1,
+                            actual_update_step,
+                            save_dir,
+                        )
+
+                        # remove old checkpoints
+                        rm_checkpoints(exp_dir, keep_n_latest=cfg.get("keep_n_latest", -1))
+                        logger.info("Removed old checkpoints and kept %s latest ones.", cfg.get("keep_n_latest", -1))
+                # uncomment below 3 lines to benchmark checkpoint
+                # if ckpt_every > 0 and actual_update_step % ckpt_every == 0:
+                #     booster.checkpoint_io._sync_io()
+                #     checkpoint_io._sync_io()
+                # == terminal timer ==
+                if cfg.get("record_time", False):
+                    print(timers.to_str(epoch, step))
+
+        sampler.reset()
+        start_step = 0
+    log_cuda_max_memory("final")
+
+
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/scripts/vae/inference.py b/scripts/vae/inference.py
new file mode 100644
index 0000000..8130cb3
--- /dev/null
+++ b/scripts/vae/inference.py
@@ -0,0 +1,142 @@
+import os
+from pprint import pformat
+
+import colossalai
+import torch
+from colossalai.utils import get_current_device, set_seed
+from tqdm import tqdm
+
+from opensora.acceleration.parallel_states import get_data_parallel_group
+from opensora.datasets import save_sample
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.registry import DATASETS, MODELS, build_module
+from opensora.utils.config import parse_configs
+from opensora.utils.logger import create_logger, is_distributed, is_main_process
+from opensora.utils.misc import log_cuda_max_memory, log_model_params, to_torch_dtype
+
+
+@torch.inference_mode()
+def main():
+    torch.set_grad_enabled(False)
+    # ======================================================
+    # configs & runtime variables
+    # ======================================================
+    # == parse configs ==
+    cfg = parse_configs()
+
+    # == get dtype & device ==
+    dtype = to_torch_dtype(cfg.get("dtype", "fp32"))
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if is_distributed():
+        colossalai.launch_from_torch({})
+        device = get_current_device()
+    set_seed(cfg.get("seed", 1024))
+
+    # == init logger ==
+    logger = create_logger()
+    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
+    verbose = cfg.get("verbose", 1)
+
+    # ======================================================
+    # build model & loss
+    # ======================================================
+    if cfg.get("ckpt_path", None) is not None:
+        cfg.model.from_pretrained = cfg.ckpt_path
+    logger.info("Building models...")
+    model = build_module(cfg.model, MODELS, device_map=device, torch_dtype=dtype).eval()
+    log_model_params(model)
+
+    # ======================================================
+    # build dataset and dataloader
+    # ======================================================
+    logger.info("Building dataset...")
+    # == build dataset ==
+    dataset = build_module(cfg.dataset, DATASETS)
+    logger.info("Dataset contains %s samples.", len(dataset))
+    # == build dataloader ==
+    dataloader_args = dict(
+        dataset=dataset,
+        batch_size=cfg.get("batch_size", None),
+        num_workers=cfg.get("num_workers", 4),
+        seed=cfg.get("seed", 1024),
+        shuffle=False,
+        drop_last=False,
+        pin_memory=True,
+        process_group=get_data_parallel_group(),
+        prefetch_factor=cfg.get("prefetch_factor", None),
+    )
+
+    if cfg.get("eval_setting", None) is not None:
+        # e.g. 32x256, 1x1024
+        num_frames = int(cfg.eval_setting.split("x")[0])
+        resolution = str(cfg.eval_setting.split("x")[-1])
+        bucket_config = {
+            resolution + "px" + "_ar1:1": {num_frames: (1.0, 1)},
+        }
+        print("eval setting:\n", bucket_config)
+    else:
+        bucket_config = cfg.get("bucket_config", None)
+
+    dataloader, sampler = prepare_dataloader(
+        bucket_config=bucket_config,
+        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
+        **dataloader_args,
+    )
+    dataiter = iter(dataloader)
+    num_steps_per_epoch = len(dataloader)
+
+    # ======================================================
+    # inference
+    # ======================================================
+    # prepare arguments
+    save_fps = cfg.get("fps", 16) // cfg.get("frame_interval", 1)
+    save_dir = cfg.get("save_dir", None)
+    save_dir_orig = os.path.join(save_dir, "orig")
+    save_dir_recn = os.path.join(save_dir, "recn")
+    os.makedirs(save_dir_orig, exist_ok=True)
+    os.makedirs(save_dir_recn, exist_ok=True)
+
+    running_sum = running_var = 0.0
+    num_samples = 0
+
+    # Iter over the dataset
+    with tqdm(
+        enumerate(dataiter),
+        disable=not is_main_process() or verbose < 1,
+        total=num_steps_per_epoch,
+        initial=0,
+    ) as pbar:
+        for _, batch in pbar:
+            # == load data ==
+            x = batch["video"].to(device, dtype)  # [B, C, T, H, W]
+            path = batch["path"]
+
+            # == vae encoding & decoding ===
+            x_rec, posterior, z = model(x)
+
+            num_samples += 1
+            running_sum += z.mean()
+            running_var += (z - running_sum / num_samples).pow(2).mean()
+            if num_samples % 10 == 0:
+                logger.info(
+                    "VAE feature per channel stats: mean %s, var %s",
+                    (running_sum / num_samples).item(),
+                    (running_var / num_samples).sqrt().item(),
+                )
+
+            # == save samples ==
+            if is_main_process() and save_dir is not None:
+                for idx, x_orig in enumerate(x):
+                    fname = os.path.splitext(os.path.basename(path[idx]))[0]
+                    save_path_orig = os.path.join(save_dir_orig, f"{fname}_orig")
+                    save_sample(x_orig, save_path=save_path_orig, fps=save_fps)
+
+                    save_path_rec = os.path.join(save_dir_recn, f"{fname}_recn")
+                    save_sample(x_rec[idx], save_path=save_path_rec, fps=save_fps)
+
+    logger.info("Inference finished.")
+    log_cuda_max_memory("inference")
+
+
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/scripts/vae/stats.py b/scripts/vae/stats.py
new file mode 100644
index 0000000..d87eacd
--- /dev/null
+++ b/scripts/vae/stats.py
@@ -0,0 +1,118 @@
+from pprint import pformat
+
+import colossalai
+import torch
+from colossalai.utils import get_current_device, set_seed
+from tqdm import tqdm
+
+from opensora.acceleration.parallel_states import get_data_parallel_group
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.registry import DATASETS, MODELS, build_module
+from opensora.utils.config import parse_configs
+from opensora.utils.logger import create_logger, is_distributed, is_main_process
+from opensora.utils.misc import log_cuda_max_memory, log_model_params, to_torch_dtype
+
+
+@torch.inference_mode()
+def main():
+    torch.set_grad_enabled(False)
+    # ======================================================
+    # configs & runtime variables
+    # ======================================================
+    # == parse configs ==
+    cfg = parse_configs()
+
+    # == get dtype & device ==
+    dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if is_distributed():
+        colossalai.launch_from_torch({})
+        device = get_current_device()
+    set_seed(cfg.get("seed", 1024))
+
+    # == init logger ==
+    logger = create_logger()
+    logger.info("Inference configuration:\n %s", pformat(cfg.to_dict()))
+    verbose = cfg.get("verbose", 1)
+
+    # ======================================================
+    # build model & loss
+    # ======================================================
+    if cfg.get("ckpt_path", None) is not None:
+        cfg.model.from_pretrained = cfg.ckpt_path
+    logger.info("Building models...")
+    model = build_module(cfg.model, MODELS, device_map=device, torch_dtype=dtype).eval()
+    log_model_params(model)
+
+    # ======================================================
+    # build dataset and dataloader
+    # ======================================================
+    logger.info("Building dataset...")
+    # == build dataset ==
+    dataset = build_module(cfg.dataset, DATASETS)
+    logger.info("Dataset contains %s samples.", len(dataset))
+    # == build dataloader ==
+    dataloader_args = dict(
+        dataset=dataset,
+        batch_size=cfg.get("batch_size", None),
+        num_workers=cfg.get("num_workers", 4),
+        seed=cfg.get("seed", 1024),
+        shuffle=False,
+        drop_last=False,
+        pin_memory=True,
+        process_group=get_data_parallel_group(),
+        prefetch_factor=cfg.get("prefetch_factor", None),
+    )
+
+    if cfg.get("eval_setting", None) is not None:
+        # e.g. 32x256x256, 1x1024x1024
+        num_frames = int(cfg.eval_setting.split("x")[0])
+        resolution = str(cfg.eval_setting.split("x")[-1])
+        bucket_config = {
+            resolution + "px_ar1:1": {num_frames: (1.0, 1)},
+        }
+        print("eval setting:\n", bucket_config)
+    else:
+        bucket_config = cfg.get("bucket_config", None)
+
+    dataloader, _ = prepare_dataloader(
+        bucket_config=bucket_config,
+        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
+        **dataloader_args,
+    )
+    dataiter = iter(dataloader)
+    num_steps_per_epoch = len(dataloader)
+
+    # ======================================================
+    # inference
+    # ======================================================
+    num_samples = 0
+    running_sum = running_var = 0.0
+
+    # Iter over the dataset
+    with tqdm(
+        enumerate(dataiter),
+        disable=not is_main_process() or verbose < 1,
+        total=num_steps_per_epoch,
+        initial=0,
+    ) as pbar:
+        for _, batch in pbar:
+            # == load data ==
+            x = batch["video"].to(device, dtype)  # [B, C, T, H, W]
+
+            # == vae encoding & decoding ===
+            z = model.encode(x)
+
+            num_samples += 1
+            running_sum += z.mean().item()
+            running_var += (z - running_sum / num_samples).pow(2).mean().item()
+            shift = running_sum / num_samples
+            scale = (running_var / num_samples) ** 0.5
+            pbar.set_postfix({"mean": shift, "std": scale})
+
+    logger.info("Mean: %.4f, std: %.4f", shift, scale)
+    log_cuda_max_memory("inference")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/scripts/vae/train.py b/scripts/vae/train.py
new file mode 100644
index 0000000..9fbcc92
--- /dev/null
+++ b/scripts/vae/train.py
@@ -0,0 +1,597 @@
+import gc
+import os
+import random
+import subprocess
+import warnings
+from contextlib import nullcontext
+from copy import deepcopy
+from pprint import pformat
+
+warnings.filterwarnings("ignore", category=FutureWarning)  # silence FutureWarning/UserWarning for the whole process
+warnings.filterwarnings("ignore", category=UserWarning)
+gc.disable()  # automatic GC is off; main() calls gc.collect() itself right before saving a checkpoint
+
+
+import torch
+import torch.distributed as dist
+from colossalai.booster import Booster
+from colossalai.utils import set_seed
+from torch.profiler import ProfilerActivity, profile, schedule
+from tqdm import tqdm
+
+import wandb
+from opensora.acceleration.checkpoint import set_grad_checkpoint
+from opensora.acceleration.parallel_states import get_data_parallel_group
+from opensora.datasets.dataloader import prepare_dataloader
+from opensora.datasets.pin_memory_cache import PinMemoryCache
+from opensora.models.vae.losses import DiscriminatorLoss, GeneratorLoss, VAELoss
+from opensora.registry import DATASETS, MODELS, build_module
+from opensora.utils.ckpt import CheckpointIO, model_sharding, record_model_param_shape, rm_checkpoints
+from opensora.utils.config import config_to_name, create_experiment_workspace, parse_configs
+from opensora.utils.logger import create_logger
+from opensora.utils.misc import (
+    Timer,
+    all_reduce_sum,
+    create_tensorboard_writer,
+    is_log_process,
+    log_model_params,
+    to_torch_dtype,
+)
+from opensora.utils.optimizer import create_lr_scheduler, create_optimizer
+from opensora.utils.train import create_colossalai_plugin, set_lr, set_warmup_steps, setup_device, update_ema
+
+torch.backends.cudnn.benchmark = True  # let cuDNN benchmark and pick its convolution algorithms
+
+WAIT = 1  # profiler schedule phase lengths, in steps; only used when cfg "profile" is enabled
+WARMUP = 10
+ACTIVE = 20
+
+my_schedule = schedule(
+    wait=WAIT,  # number of steps the profiler stays idle
+    warmup=WARMUP,  # number of steps traced but discarded (profiler warm-up)
+    active=ACTIVE,  # number of steps actually recorded
+)
+
+
+def main():  # Train the VAE (optionally with a discriminator) through a ColossalAI Booster; see sections 1-5.
+    # ======================================================
+    # 1. configs & runtime variables
+    # ======================================================
+    # == parse configs ==
+    cfg = parse_configs()
+
+    # == get dtype & device ==
+    dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
+    device, coordinator = setup_device()
+    checkpoint_io = CheckpointIO()
+    set_seed(cfg.get("seed", 1024))
+    PinMemoryCache.force_dtype = dtype
+    pin_memory_cache_pre_alloc_numels = cfg.get("pin_memory_cache_pre_alloc_numels", None)
+    PinMemoryCache.pre_alloc_numels = pin_memory_cache_pre_alloc_numels
+
+    # == init ColossalAI booster ==
+    plugin_type = cfg.get("plugin", "zero2")
+    plugin_config = cfg.get("plugin_config", {})
+    plugin = (
+        create_colossalai_plugin(
+            plugin=plugin_type,
+            dtype=cfg.get("dtype", "bf16"),
+            grad_clip=cfg.get("grad_clip", 0),
+            **plugin_config,
+        )
+        if plugin_type != "none"
+        else None
+    )
+    booster = Booster(plugin=plugin)
+
+    # == init exp_dir ==
+    exp_name, exp_dir = create_experiment_workspace(
+        cfg.get("outputs", "./outputs"),
+        model_name=config_to_name(cfg),
+        config=cfg.to_dict(),
+    )
+    if is_log_process(plugin_type, plugin_config):
+        print(f"changing {exp_dir} to share")
+        os.system(f"chgrp -R share {exp_dir}")  # best effort: the exit status is ignored
+
+    # == init logger, tensorboard & wandb ==
+    logger = create_logger(exp_dir)
+    logger.info("Training configuration:\n %s", pformat(cfg.to_dict()))
+    tb_writer = None
+    if coordinator.is_master():
+        tb_writer = create_tensorboard_writer(exp_dir)
+        if cfg.get("wandb", False):
+            wandb.init(
+                project=cfg.get("wandb_project", "Open-Sora"),
+                name=cfg.get("wandb_expr_name", exp_name),
+                config=cfg.to_dict(),
+                dir=exp_dir,
+            )
+
+    # ======================================================
+    # 2. build dataset and dataloader
+    # ======================================================
+    logger.info("Building dataset...")
+    # == build dataset ==
+    dataset = build_module(cfg.dataset, DATASETS)
+    logger.info("Dataset contains %s samples.", len(dataset))
+
+    # == build dataloader ==
+    cache_pin_memory = pin_memory_cache_pre_alloc_numels is not None  # caching is on only if a pre-alloc size is set
+    dataloader_args = dict(
+        dataset=dataset,
+        batch_size=cfg.get("batch_size", None),
+        num_workers=cfg.get("num_workers", 4),
+        seed=cfg.get("seed", 1024),
+        shuffle=True,
+        drop_last=True,
+        pin_memory=True,
+        process_group=get_data_parallel_group(),
+        prefetch_factor=cfg.get("prefetch_factor", None),
+        cache_pin_memory=cache_pin_memory,
+    )
+    dataloader, sampler = prepare_dataloader(
+        bucket_config=cfg.get("bucket_config", None),
+        num_bucket_build_workers=cfg.get("num_bucket_build_workers", 1),
+        **dataloader_args,
+    )
+    num_steps_per_epoch = len(dataloader)
+
+    # ======================================================
+    # 3. build model
+    # ======================================================
+    logger.info("Building models...")
+
+    # == build vae model ==
+    model = build_module(cfg.model, MODELS, device_map=device, torch_dtype=dtype).train()
+    log_model_params(model)
+
+    if cfg.get("grad_checkpoint", False):
+        set_grad_checkpoint(model)
+    vae_loss_fn = VAELoss(**cfg.vae_loss_config, device=device, dtype=dtype)
+
+    # == build EMA model ==
+    if cfg.get("ema_decay", None) is not None:
+        ema = deepcopy(model).cpu().eval().requires_grad_(False)  # frozen CPU copy; moved to device after sharding
+        ema_shape_dict = record_model_param_shape(ema)
+        logger.info("EMA model created.")
+    else:
+        ema = ema_shape_dict = None
+        logger.info("No EMA model created.")
+
+    # == build discriminator model ==
+    use_discriminator = cfg.get("discriminator", None) is not None
+    if use_discriminator:
+        discriminator = build_module(cfg.discriminator, MODELS).to(device, dtype).train()
+        log_model_params(discriminator)
+        generator_loss_fn = GeneratorLoss(**cfg.gen_loss_config)
+        discriminator_loss_fn = DiscriminatorLoss(**cfg.disc_loss_config)
+
+    # == setup optimizer ==
+    optimizer = create_optimizer(model, cfg.optim)
+
+    # == setup lr scheduler ==
+    lr_scheduler = create_lr_scheduler(
+        optimizer=optimizer, num_steps_per_epoch=num_steps_per_epoch, epochs=cfg.get("epochs", 1000), **cfg.lr_scheduler
+    )
+
+    # == setup discriminator optimizer ==
+    if use_discriminator:
+        disc_optimizer = create_optimizer(discriminator, cfg.optim_discriminator)
+        disc_lr_scheduler = create_lr_scheduler(
+            optimizer=disc_optimizer,
+            num_steps_per_epoch=num_steps_per_epoch,
+            epochs=cfg.get("epochs", 1000),
+            **cfg.disc_lr_scheduler,
+        )
+
+    # =======================================================
+    # 4. distributed training preparation with colossalai
+    # =======================================================
+    logger.info("Preparing for distributed training...")
+    # == boosting ==
+    torch.set_default_dtype(dtype)  # default dtype is switched only while boosting and reset to float below
+    model, optimizer, _, dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optimizer,
+        lr_scheduler=lr_scheduler,
+        dataloader=dataloader,
+    )
+
+    if use_discriminator:
+        discriminator, disc_optimizer, _, _, disc_lr_scheduler = booster.boost(
+            model=discriminator,
+            optimizer=disc_optimizer,
+            lr_scheduler=disc_lr_scheduler,
+        )
+    torch.set_default_dtype(torch.float)
+    logger.info("Boosted model for distributed training")
+
+    # == global variables ==
+    cfg_epochs = cfg.get("epochs", 1000)
+    mixed_strategy = cfg.get("mixed_strategy", None)
+    mixed_image_ratio = cfg.get("mixed_image_ratio", 0.0)
+    # modulate mixed image ratio since we force rank 0 to be video
+    num_ranks = dist.get_world_size()
+    modulated_mixed_image_ratio = (
+        num_ranks * mixed_image_ratio / (num_ranks - 1) if num_ranks > 1 else mixed_image_ratio
+    )
+    if is_log_process(plugin_type, plugin_config):
+        print("modulated mixed image ratio:", modulated_mixed_image_ratio)
+
+    start_epoch = start_step = log_step = acc_step = 0  # NOTE(review): acc_step is incremented but never read
+    running_loss = dict(  # loss accumulated over config.log_every steps
+        all=0.0,
+        nll=0.0,
+        nll_rec=0.0,
+        nll_per=0.0,
+        kl=0.0,
+        gen=0.0,
+        gen_w=0.0,
+        disc=0.0,
+        debug=0.0,
+    )
+
+    def log_loss(name, loss, loss_dict, use_video):
+        # zero the loss of ranks that did not train on video, then average the all-reduced sum over video ranks
+        if use_video == 0:
+            loss.data = torch.tensor(0.0, device=device, dtype=dtype)
+        all_reduce_sum(loss.data)
+        num_video = torch.tensor(use_video, device=device, dtype=dtype)
+        all_reduce_sum(num_video)
+        loss_item = loss.item() / num_video.item()
+        loss_dict[name] = loss_item
+        running_loss[name] += loss_item
+
+    logger.info("Training for %s epochs with %s steps per epoch", cfg_epochs, num_steps_per_epoch)
+
+    # == resume ==
+    if cfg.get("load", None) is not None:
+        logger.info("Loading checkpoint from %s", cfg.load)
+        start_epoch = cfg.get("start_epoch", None)
+        start_step = cfg.get("start_step", None)
+        ret = checkpoint_io.load(
+            booster,
+            cfg.load,
+            model=model,
+            ema=ema,
+            optimizer=optimizer,
+            lr_scheduler=lr_scheduler,
+            sampler=(
+                None if start_step is not None else sampler
+            ),  # if specify start step, set last_micro_batch_access_index of a new sampler instead
+        )
+        if start_step is not None:
+            # if start step exceeds data length, go to next epoch (NOTE(review): strict ">" skips the == case)
+            if start_step > num_steps_per_epoch:
+                start_epoch = (
+                    start_epoch + start_step // num_steps_per_epoch
+                    if start_epoch is not None
+                    else start_step // num_steps_per_epoch
+                )
+                start_step = start_step % num_steps_per_epoch
+            sampler.set_step(start_step)
+
+        start_epoch = start_epoch if start_epoch is not None else ret[0]  # config values win over the checkpoint's
+        start_step = start_step if start_step is not None else ret[1]
+
+        if (
+            use_discriminator
+            and os.path.exists(os.path.join(cfg.load, "discriminator"))
+            and not cfg.get("restart_disc", False)
+        ):
+            booster.load_model(discriminator, os.path.join(cfg.load, "discriminator"))
+            if cfg.get("load_optimizer", True):
+                booster.load_optimizer(disc_optimizer, os.path.join(cfg.load, "disc_optimizer"))
+                if disc_lr_scheduler is not None:
+                    booster.load_lr_scheduler(disc_lr_scheduler, os.path.join(cfg.load, "disc_lr_scheduler"))
+                if cfg.get("disc_lr", None) is not None:
+                    set_lr(disc_optimizer, disc_lr_scheduler, cfg.disc_lr)
+
+        logger.info("Loaded checkpoint %s at epoch %s step %s", cfg.load, start_epoch, start_step)
+
+        if cfg.get("lr", None) is not None:
+            set_lr(optimizer, lr_scheduler, cfg.lr, cfg.get("initial_lr", None))
+
+        if cfg.get("update_warmup_steps", False):
+            assert (
+                cfg.lr_scheduler.get("warmup_steps", None) is not None
+            ), "you need to set lr_scheduler.warmup_steps in order to pass --update-warmup-steps True"
+            set_warmup_steps(lr_scheduler, cfg.lr_scheduler.warmup_steps)
+            if use_discriminator:
+                assert (
+                    cfg.disc_lr_scheduler.get("warmup_steps", None) is not None
+                ), "you need to set disc_lr_scheduler.warmup_steps in order to pass --update-warmup-steps True"
+                set_warmup_steps(disc_lr_scheduler, cfg.disc_lr_scheduler.warmup_steps)
+
+    # == sharding EMA model ==
+    if ema is not None:
+        model_sharding(ema)
+        ema = ema.to(device)
+
+    if cfg.get("freeze_layers", None) == "all":
+        for param in model.module.parameters():
+            param.requires_grad = False
+        print("all layers frozen")
+
+    # model.module.requires_grad_(False)
+    # =======================================================
+    # 5. training loop
+    # =======================================================
+    dist.barrier()
+    accumulation_steps = int(cfg.get("accumulation_steps", 1))
+    for epoch in range(start_epoch, cfg_epochs):
+        # == set dataloader to new epoch ==
+        sampler.set_epoch(epoch)
+        dataiter = iter(dataloader)
+        logger.info("Beginning epoch %s...", epoch)
+        random.seed(1024 + dist.get_rank())  # per-rank seed, reset each epoch, for the video/image mixing draws below
+
+        # == training loop in an epoch ==
+        with tqdm(
+            enumerate(dataiter, start=start_step),
+            desc=f"Epoch {epoch}",
+            disable=not coordinator.is_master(),
+            total=num_steps_per_epoch,
+            initial=start_step,
+        ) as pbar:
+            pbar_iter = iter(pbar)
+
+            def fetch_data():  # pull the next batch and start its non-blocking copy to the device
+                step, batch = next(pbar_iter)
+                pinned_video = batch["video"]
+                batch["video"] = pinned_video.to(device, dtype, non_blocking=True)
+                return batch, step, pinned_video
+
+            batch_, step_, pinned_video_ = fetch_data()  # prime the one-batch-ahead prefetch
+
+            profiler_ctxt = (
+                profile(
+                    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                    schedule=my_schedule,
+                    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/profile"),
+                    record_shapes=True,
+                    profile_memory=True,
+                    with_stack=True,
+                )
+                if cfg.get("profile", False)
+                else nullcontext()
+            )
+
+            with profiler_ctxt:
+                for _ in range(start_step, num_steps_per_epoch):
+                    if cfg.get("profile", False) and _ == WARMUP + ACTIVE + WAIT + 3:  # profiling run: stop early
+                        break
+
+                    # == load data: take the prefetched batch, then prefetch the next one ===
+                    batch, step, pinned_video = batch_, step_, pinned_video_
+                    if step + 1 < num_steps_per_epoch:
+                        batch_, step_, pinned_video_ = fetch_data()
+
+                    # == step counters ==
+                    global_step = epoch * num_steps_per_epoch + step
+                    actual_update_step = (global_step + 1) // accumulation_steps  # counted in accumulation groups
+                    log_step += 1
+                    acc_step += 1
+
+                    # == mixed strategy ==
+                    x = batch["video"]
+                    t_length = x.size(2)
+                    use_video = 1
+                    if mixed_strategy == "mixed_video_image":
+                        if random.random() < modulated_mixed_image_ratio and dist.get_rank() != 0:
+                            # NOTE: enable the first rank to use video
+                            t_length = 1
+                            use_video = 0
+                    elif mixed_strategy == "mixed_video_random":
+                        t_length = random.randint(1, x.size(2))
+                    x = x[:, :, :t_length, :, :]  # keep only the first t_length entries along dim 2
+
+                    with Timer("model", log=True) if cfg.get("profile", False) else nullcontext():
+                        # == forward pass ==
+                        x_rec, posterior, z = model(x)
+
+                        if cfg.get("profile", False):
+                            profiler_ctxt.step()
+
+                        if cache_pin_memory:
+                            dataiter.remove_cache(pinned_video)  # presumably releases the pinned buffer; TODO confirm
+
+                        # == loss initialization ==
+                        vae_loss = torch.tensor(0.0, device=device, dtype=dtype)
+                        loss_dict = {}  # loss at every step
+
+                        # == reconstruction loss ==
+                        ret = vae_loss_fn(x, x_rec, posterior)
+                        nll_loss = ret["nll_loss"]
+                        kl_loss = ret["kl_loss"]
+                        recon_loss = ret["recon_loss"]
+                        perceptual_loss = ret["perceptual_loss"]
+                        vae_loss += nll_loss + kl_loss
+
+                        # == generator loss ==
+                        if use_discriminator:
+                            # turn off grad update for disc
+                            discriminator.requires_grad_(False)
+                            fake_logits = discriminator(x_rec.contiguous())
+
+                            generator_loss, g_loss = generator_loss_fn(
+                                fake_logits,
+                                nll_loss,
+                                model.module.get_last_layer(),
+                                actual_update_step,
+                                is_training=model.training,
+                            )
+                            # print(f"generator_loss: {generator_loss}, recon_loss: {recon_loss}, perceptual_loss: {perceptual_loss}")
+
+                            vae_loss += generator_loss
+                            # turn on disc training
+                            discriminator.requires_grad_(True)
+
+                        # == generator backward & update ==
+                        ctx = (
+                            booster.no_sync(model, optimizer)
+                            if cfg.get("plugin", "zero2") in ("zero1", "zero1-seq")
+                            and (step + 1) % accumulation_steps != 0  # zero1 only: no_sync on non-update micro-steps
+                            else nullcontext()
+                        )
+                        with Timer("backward", log=True) if cfg.get("profile", False) else nullcontext():
+                            with ctx:
+                                booster.backward(loss=vae_loss / accumulation_steps, optimizer=optimizer)
+
+                        with Timer("optimizer", log=True) if cfg.get("profile", False) else nullcontext():
+                            if (step + 1) % accumulation_steps == 0:
+                                optimizer.step()
+                                optimizer.zero_grad()
+                                if lr_scheduler is not None:
+                                    lr_scheduler.step(
+                                        actual_update_step,
+                                    )
+                                # == update EMA ==
+                                if ema is not None:
+                                    update_ema(
+                                        ema,
+                                        model.unwrap(),
+                                        optimizer=optimizer,
+                                        decay=cfg.get("ema_decay", 0.9999),
+                                    )
+
+                    # == reduce losses across ranks & accumulate the running sums ==
+                    log_loss("all", vae_loss, loss_dict, use_video)
+                    log_loss("nll", nll_loss, loss_dict, use_video)
+                    log_loss("nll_rec", recon_loss, loss_dict, use_video)
+                    log_loss("nll_per", perceptual_loss, loss_dict, use_video)
+                    log_loss("kl", kl_loss, loss_dict, use_video)
+                    if use_discriminator:
+                        log_loss("gen_w", generator_loss, loss_dict, use_video)
+                        log_loss("gen", g_loss, loss_dict, use_video)
+
+                    # == loss: discriminator adversarial ==
+                    if use_discriminator:
+                        real_logits = discriminator(x.detach().contiguous())
+                        fake_logits = discriminator(x_rec.detach().contiguous())
+                        disc_loss = discriminator_loss_fn(
+                            real_logits,
+                            fake_logits,
+                            actual_update_step,
+                        )
+
+                        # == discriminator backward & update ==
+                        ctx = (
+                            booster.no_sync(discriminator, disc_optimizer)
+                            if cfg.get("plugin", "zero2") in ("zero1", "zero1-seq")
+                            and (step + 1) % accumulation_steps != 0
+                            else nullcontext()
+                        )
+                        with ctx:
+                            booster.backward(loss=disc_loss / accumulation_steps, optimizer=disc_optimizer)
+                        if (step + 1) % accumulation_steps == 0:
+                            disc_optimizer.step()
+                            disc_optimizer.zero_grad()
+                            if disc_lr_scheduler is not None:
+                                disc_lr_scheduler.step(actual_update_step)
+
+                        # log
+                        log_loss("disc", disc_loss, loss_dict, use_video)
+
+                    # == logging & checkpointing, evaluated every accumulation_steps global steps ==
+                    if (global_step + 1) % accumulation_steps == 0:
+                        if coordinator.is_master() and actual_update_step % cfg.get("log_every", 1) == 0:
+                            avg_loss = {k: v / log_step for k, v in running_loss.items()}
+                            # progress bar
+                            pbar.set_postfix(
+                                {
+                                    # "step": step,
+                                    # "global_step": global_step,
+                                    # "actual_update_step": actual_update_step,
+                                    # "lr": optimizer.param_groups[0]["lr"],
+                                    **{k: f"{v:.2f}" for k, v in avg_loss.items()},
+                                }
+                            )
+                            # tensorboard
+                            tb_writer.add_scalar("loss", vae_loss.item(), actual_update_step)
+                            # wandb
+                            if cfg.get("wandb", False):
+                                wandb.log(
+                                    {
+                                        "iter": global_step,
+                                        "epoch": epoch,
+                                        "lr": optimizer.param_groups[0]["lr"],
+                                        "avg_loss_": avg_loss,
+                                        "avg_loss": avg_loss["all"],
+                                        "loss_": loss_dict,
+                                        "loss": vae_loss.item(),
+                                        "global_grad_norm": optimizer.get_grad_norm(),
+                                    },
+                                    step=actual_update_step,
+                                )
+
+                            running_loss = {k: 0.0 for k in running_loss}  # reset here, i.e. on the master rank only
+                            log_step = 0
+
+                        # == checkpoint saving ==
+                        ckpt_every = cfg.get("ckpt_every", 0)
+                        if ckpt_every > 0 and actual_update_step % ckpt_every == 0 and coordinator.is_master():
+                            subprocess.run("sudo drop_cache", shell=True)  # NOTE(review): site-specific command; confirm
+
+                        if ckpt_every > 0 and actual_update_step % ckpt_every == 0:
+                            # manual garbage collection (automatic GC is disabled at import time)
+                            gc.collect()
+
+                            save_dir = checkpoint_io.save(
+                                booster,
+                                exp_dir,
+                                model=model,
+                                ema=ema,
+                                optimizer=optimizer,
+                                lr_scheduler=lr_scheduler,
+                                sampler=sampler,
+                                epoch=epoch,
+                                step=step + 1,
+                                global_step=global_step + 1,
+                                batch_size=cfg.get("batch_size", None),
+                                actual_update_step=actual_update_step,
+                                ema_shape_dict=ema_shape_dict,
+                                async_io=True,
+                            )
+
+                            if is_log_process(plugin_type, plugin_config):
+                                os.system(f"chgrp -R share {save_dir}")
+
+                            if use_discriminator:
+                                booster.save_model(discriminator, os.path.join(save_dir, "discriminator"), shard=True)
+                                booster.save_optimizer(
+                                    disc_optimizer,
+                                    os.path.join(save_dir, "disc_optimizer"),
+                                    shard=True,
+                                    size_per_shard=4096,
+                                )
+                                if disc_lr_scheduler is not None:
+                                    booster.save_lr_scheduler(
+                                        disc_lr_scheduler, os.path.join(save_dir, "disc_lr_scheduler")
+                                    )
+                            dist.barrier()
+
+                            logger.info(
+                                "Saved checkpoint at epoch %s, step %s, global_step %s to %s",
+                                epoch,
+                                step + 1,
+                                actual_update_step,
+                                save_dir,
+                            )
+
+                            # remove old checkpoints
+                            rm_checkpoints(exp_dir, keep_n_latest=cfg.get("keep_n_latest", -1))
+                            logger.info(
+                                "Removed old checkpoints and kept %s latest ones.", cfg.get("keep_n_latest", -1)
+                            )
+
+            if cfg.get("profile", False):
+                profiler_ctxt.export_chrome_trace("./log/profile/trace.json")
+
+        sampler.reset()
+        start_step = 0  # only the first (possibly resumed) epoch starts mid-way
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..4a0ec1f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,76 @@
+from typing import List
+
+from setuptools import find_packages, setup
+
+
+def fetch_requirements(paths) -> List[str]:
+    """
+    This function reads the requirements file.
+
+    Args:
+        path (str): the path to the requirements file.
+
+    Returns:
+        The lines in the requirements file.
+    """
+    if not isinstance(paths, list):
+        paths = [paths]
+    requirements = []
+    for path in paths:
+        with open(path, "r") as fd:
+            requirements += [r.strip() for r in fd.readlines()]
+    return requirements
+
+
+def fetch_readme() -> str:
+    """
+    This function reads the README.md file in the current directory.
+
+    Returns:
+        The lines in the README file.
+    """
+    with open("README.md", encoding="utf-8") as f:
+        return f.read()
+
+
+setup(
+    name="opensora",
+    version="2.0.0",
+    packages=find_packages(
+        exclude=(  # names excluded from package discovery
+            "assets",
+            "configs",
+            "docs",
+            "eval",
+            "evaluation_results",
+            "gradio",
+            "logs",
+            "notebooks",
+            "outputs",
+            "pretrained_models",
+            "samples",
+            "scripts",
+            "*.egg-info",
+        )
+    ),
+    description="Democratizing Efficient Video Production for All",
+    long_description=fetch_readme(),  # README.md is read at setup time from the current working directory
+    long_description_content_type="text/markdown",
+    license="Apache Software License 2.0",
+    url="https://github.com/hpcaitech/Open-Sora",
+    project_urls={
+        "Bug Tracker": "https://github.com/hpcaitech/Open-Sora/issues",
+        "Examples": "https://hpcaitech.github.io/Open-Sora/",
+        "Documentation": "https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file",
+        "Github": "https://github.com/hpcaitech/Open-Sora",
+    },
+    install_requires=fetch_requirements("requirements.txt"),  # path is relative to the current working directory
+    python_requires=">=3.6",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Environment :: GPU :: NVIDIA CUDA",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: System :: Distributed Computing",
+    ],
+)
diff --git a/tests/test_attn.py b/tests/test_attn.py
new file mode 100644
index 0000000..930237a
--- /dev/null
+++ b/tests/test_attn.py
@@ -0,0 +1,217 @@
+from itertools import product
+
+import pytest
+import torch
+from colossalai.accelerator import get_accelerator
+from colossalai.utils import get_current_device
+
+from opensora.models.layers.blocks import Attention, split_batch_cat_seq, split_seq_cat_batch
+from opensora.models.layers.rotary_embedding_torch import RotaryEmbedding
+
+# B, S, H = 7488, 1, 1152
+# B, S, H = 32, 234, 1152
+B, S, H = 128, 32, 1152
+N, D = 16, 72
+
+
+def run_attn(enable_flash_attn: bool):
+    """Run one bf16 forward/backward pass through Attention and print the peak allocated memory."""
+    get_accelerator().reset_peak_memory_stats()
+    rotary = RotaryEmbedding(D).to(device=get_current_device(), dtype=torch.bfloat16)
+    layer = Attention(
+        H, N, qkv_bias=True, rope=rotary.rotate_queries_or_keys, enable_flash_attn=enable_flash_attn
+    )
+    layer = layer.to(device=get_current_device(), dtype=torch.bfloat16)
+    # Leaf input with requires_grad so that backward() also produces an input gradient.
+    inputs = torch.randn(B, S, H, device=get_current_device(), dtype=torch.bfloat16).requires_grad_()
+    outputs = layer(inputs)
+    # Reduce to a scalar so backward() can be called without an explicit gradient argument.
+    outputs.mean().backward()
+    print(f"Peak memory: {get_accelerator().max_memory_allocated() / 1024**2:.2f} MB")
+
+
+def test_block_transform():
+    """split_seq_cat_batch followed by split_batch_cat_seq must reproduce the input exactly."""
+    batch, dims, kernel_sizes = 8, (1, 2), (3, 2)
+    x = torch.randn(batch, 12, 4, 3)
+    # Number of blocks along each split dim: 12 // 3 and 4 // 2.
+    num_splits = [x.size(dim) // kernel for dim, kernel in zip(dims, kernel_sizes)]
+    blocks = split_seq_cat_batch(x, kernel_sizes, dims)
+    restored = split_batch_cat_seq(blocks, batch, num_splits, dims)
+    assert torch.equal(x, restored)
+
+
+@pytest.mark.parametrize(
+    "shape, kernel_sizes",
+    [
+        [(8, 12, 4, 1), (2, 2, -1)],  # divisible + N<B + 3D
+        [(1, 5, 2, 5), (2, 2, -1)],  # indivisible + N>B + 3D
+    ],
+)
+@pytest.mark.parametrize("shift_window", [False, True])
+def test_block_attn_nd(shape, kernel_sizes, shift_window):  # smoke test: output shape is kept and backward runs
+    hidden_size = 96
+    num_heads = 4
+    head_dim = hidden_size // num_heads  # 96 // 4 = 24
+    rope = RotaryEmbedding(head_dim // 3).to(device=get_current_device(), dtype=torch.bfloat16)
+    attn = Attention(
+        hidden_size,
+        num_heads,
+        qkv_bias=True,
+        qk_norm=True,
+        enable_flash_attn=True,
+        rope=rope.rotate_queries_or_keys,
+        kernel_size=kernel_sizes,
+        shift_window=shift_window,
+    ).to(device=get_current_device(), dtype=torch.bfloat16)
+    # 5-D input: the 4-tuple `shape` followed by a trailing dim of size hidden_size
+    x = torch.rand(*shape, hidden_size, device=get_current_device(), dtype=torch.bfloat16).requires_grad_()
+    y = attn(x)
+    assert x.shape == y.shape
+    loss = y.mean()  # scalar, so backward() needs no explicit gradient
+    loss.backward()
+
+
+@pytest.mark.parametrize(
+    "shape, kernel_sizes",
+    [
+        [(8, 12, 4, 1), (2, 2, -1)],  # divisible + N<B + 3D
+        [(8, 12, 4, 6), (2, 2, -1)],  # divisible + N<B + 3D
+        [(8, 12, 3, 6), (2, 2, -1)],  # W=3 is not divisible by its kernel size 2
+        [(8, 90, 60, 13), (8, 8, -1)],  # 480p video
+        [(8, 160, 90, 1), (8, 8, -1)],  # 720p image
+    ],
+)
+def test_block_attn_3d(shape, kernel_sizes):  # each block of the full output must equal attn() run on that block alone
+    hidden_size = 96
+    num_heads = 4
+    head_dim = hidden_size // num_heads
+    rope = RotaryEmbedding(head_dim // 3).to(device=get_current_device(), dtype=torch.bfloat16)
+    attn = Attention(
+        hidden_size,
+        num_heads,
+        qkv_bias=True,
+        qk_norm=True,
+        enable_flash_attn=False,
+        rope=rope.rotate_queries_or_keys,
+        kernel_size=kernel_sizes,
+    ).to(device=get_current_device(), dtype=torch.bfloat16)
+    # [B, H, W, T, C]
+    x = torch.rand(*shape, hidden_size, device=get_current_device(), dtype=torch.bfloat16)
+    y = attn(x)
+
+    split_size = [k if k > 0 else x.size(i + 1) for i, k in enumerate(kernel_sizes)]  # non-positive k = whole axis
+    for start_indices in product(*[range(0, x.size(i + 1), s) for i, s in enumerate(split_size)]):
+        piece = x[  # one block of the input; slices past the end are truncated by the indexing itself
+            :,
+            start_indices[0] : start_indices[0] + split_size[0],
+            start_indices[1] : start_indices[1] + split_size[1],
+            start_indices[2] : start_indices[2] + split_size[2],
+            :,
+        ]
+        piece_z = attn(piece)
+        piece_y = y[  # the same block, cut out of the full output
+            :,
+            start_indices[0] : start_indices[0] + split_size[0],
+            start_indices[1] : start_indices[1] + split_size[1],
+            start_indices[2] : start_indices[2] + split_size[2],
+            :,
+        ]
+        assert piece_y.shape == piece_z.shape
+        assert torch.equal(  # exact comparison, no tolerance
+            piece_z,
+            piece_y,
+        )
+
+
+@pytest.mark.parametrize(
+    "shape, kernel_sizes, kernel_sizes2",
+    [
+        [(2, 80, 60, 1), (8, 8, 4), (8, 8, -1)],  # T=1: temporal kernel 4 vs -1
+        [(2, 4, 4, 6), (4, 4, -1), (8, 8, -1)],  # H=W=4: spatial kernel 4 vs 8
+    ],
+)
+def test_block_attn_3d_var_kernel(shape, kernel_sizes, kernel_sizes2):  # same weights, two kernel sizes, close outputs
+    hidden_size = 24
+    num_heads = 2
+    head_dim = hidden_size // num_heads
+    rope = RotaryEmbedding(head_dim // 3).to(device=get_current_device(), dtype=torch.bfloat16)
+    attn = Attention(
+        hidden_size,
+        num_heads,
+        qkv_bias=True,
+        qk_norm=True,
+        enable_flash_attn=False,
+        rope=rope.rotate_queries_or_keys,
+        kernel_size=kernel_sizes,
+    ).to(device=get_current_device(), dtype=torch.bfloat16)
+    # [B, H, W, T, C]
+    x = torch.rand(*shape, hidden_size, device=get_current_device(), dtype=torch.bfloat16)
+    y = attn(x)
+    attn2 = Attention(
+        hidden_size,
+        num_heads,
+        qkv_bias=True,
+        qk_norm=True,
+        enable_flash_attn=False,
+        rope=rope.rotate_queries_or_keys,
+        kernel_size=kernel_sizes2,
+    ).to(device=get_current_device(), dtype=torch.bfloat16)
+    attn2.load_state_dict(attn.state_dict())  # copy the weights so only kernel_size differs
+    y2 = attn2(x)
+    assert y.shape == y2.shape
+    torch.testing.assert_close(y, y2)  # presumably both kernel sizes give the same blocks for these shapes — confirm
+
+
+def test_block_attn_3d_overlap():  # output on a 40x40 region must not change when the input is extended to 48x48
+    kernel_sizes = (8, 8, -1)
+    hidden_size = 24
+    num_heads = 2
+    head_dim = hidden_size // num_heads
+    rope = RotaryEmbedding(head_dim // 3).to(device=get_current_device(), dtype=torch.bfloat16)
+    attn = Attention(
+        hidden_size,
+        num_heads,
+        qkv_bias=True,
+        qk_norm=True,
+        enable_flash_attn=False,
+        rope=rope.rotate_queries_or_keys,
+        kernel_size=kernel_sizes,
+    ).to(device=get_current_device(), dtype=torch.bfloat16)
+    # [B, H, W, T, C]
+    x = torch.rand(2, 40, 40, 6, hidden_size, device=get_current_device(), dtype=torch.bfloat16)
+    y = attn(x)
+    x2 = torch.rand(2, 48, 48, 6, hidden_size, device=get_current_device(), dtype=torch.bfloat16)
+    x2[:, :40, :40] = x  # x2 holds x in its first 40 rows/columns; the remainder stays random
+    y2 = attn(x2)
+    torch.testing.assert_close(y, y2[:, :40, :40])
+
+
+def test_block_transform_3d():
+    """split_seq_cat_batch must stack the (3, 2, 6) blocks of x along the batch dim; the inverse must undo it."""
+    b = 8
+    x = torch.randn(b, 12, 4, 6, 3)
+    kernel_sizes, dims = (3, 2, 6), (1, 2, 3)
+    num_splits = [x.size(dim) // kernel for dim, kernel in zip(dims, kernel_sizes)]
+    y = split_seq_cat_batch(x, kernel_sizes, dims)
+    # A non-positive kernel size stands for "the whole axis".
+    split_size = [kernel if kernel > 0 else x.size(axis + 1) for axis, kernel in enumerate(kernel_sizes)]
+    starts_per_axis = [range(0, x.size(axis + 1), step) for axis, step in enumerate(split_size)]
+    # Block number `index` (row-major order) is expected in batch rows [index * b, (index + 1) * b) of y.
+    for index, (h0, w0, t0) in enumerate(product(*starts_per_axis)):
+        piece = x[
+            :, h0 : h0 + split_size[0], w0 : w0 + split_size[1], t0 : t0 + split_size[2], :
+        ]
+        y_piece = y[index * b : (index + 1) * b]
+        assert torch.equal(y_piece, piece), f"{y_piece.shape} vs {piece.shape}"
+
+    # Round trip: merging the blocks back must give the original tensor bit for bit.
+    z = split_batch_cat_seq(y, b, num_splits, dims)
+    assert torch.equal(x, z)
+
+
+if __name__ == "__main__":  # manual run: print peak memory with and without flash attention
+    print("Use flashattn")
+    run_attn(True)
+    print("No flashattn")
+    run_attn(False)
diff --git a/tests/test_lr_scheduler.py b/tests/test_lr_scheduler.py
new file mode 100644
index 0000000..8a3fd31
--- /dev/null
+++ b/tests/test_lr_scheduler.py
@@ -0,0 +1,31 @@
+import torch
+from torch.optim import Adam
+from torchvision.models import resnet50
+from tqdm import tqdm
+
+from opensora.utils.lr_scheduler import LinearWarmupLR
+
+
+def test_lr_scheduler():  # lr must grow on every warmup step and equal the base lr (0.01) afterwards
+    warmup_steps = 200
+    model = resnet50().cuda()
+    optimizer = Adam(model.parameters(), lr=0.01)
+    scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps)
+    current_lr = scheduler.get_lr()[0]  # lr of the first parameter group before any step
+    data = torch.rand(1, 3, 224, 224).cuda()
+
+    for i in tqdm(range(warmup_steps * 2)):  # twice the warmup length, so both phases are exercised
+        out = model(data)
+        out.mean().backward()
+        optimizer.step()
+        scheduler.step()
+
+        if i >= warmup_steps:
+            assert scheduler.get_lr()[0] == 0.01  # exact float comparison against the optimizer's base lr
+        else:
+            assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}"
+            current_lr = scheduler.get_lr()[0]
+
+
+if __name__ == "__main__":  # allow running this test directly, without pytest
+    test_lr_scheduler()
diff --git a/tests/test_np_torch.py b/tests/test_np_torch.py
new file mode 100644
index 0000000..bc7fdb0
--- /dev/null
+++ b/tests/test_np_torch.py
@@ -0,0 +1,346 @@
+from typing import Callable
+
+import numpy as np
+import torch
+
+
+# ==================================
+# Warm Up Beta
+# ==================================
+def _warmup_beta_numpy(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
+    """Return float64 betas: a linear ramp beta_start -> beta_end over the warmup prefix, then constant beta_end."""
+    warmup_steps = int(num_diffusion_timesteps * warmup_frac)
+    ramp = np.linspace(beta_start, beta_end, warmup_steps, dtype=np.float64)
+    return np.concatenate([ramp, np.full(num_diffusion_timesteps - warmup_steps, beta_end, dtype=np.float64)])
+
+
+def _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
+    """Return a float64 tensor of betas: linear ramp over the warmup prefix, then constant beta_end."""
+    warmup_steps = int(num_diffusion_timesteps * warmup_frac)
+    ramp = torch.linspace(beta_start, beta_end, warmup_steps, dtype=torch.float64)
+    return torch.cat([ramp, torch.full((num_diffusion_timesteps - warmup_steps,), beta_end, dtype=torch.float64)])
+
+
+def test_warmup_beta():
+    """Check that the NumPy and torch warmup schedules agree element-wise (np.allclose)."""
+    # 1000 steps with a 10% warmup, shared by both implementations.
+    kwargs = dict(beta_start=1e-6, beta_end=0.99, num_diffusion_timesteps=1000, warmup_frac=0.1)
+    expected = _warmup_beta_numpy(**kwargs)
+    actual = _warmup_beta_torch(**kwargs)
+    # The torch result is converted to an ndarray for the comparison.
+    assert np.allclose(expected, actual.numpy())
+    print("Test passed for warmup_beta()")
+
+
+# ==================================
+# Beta Schedule
+# ==================================
+
+
+def get_beta_schedule_numpy(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
+    """
+    Build a float64 NumPy beta schedule with num_diffusion_timesteps entries.
+
+    This is the deprecated API for creating beta schedules.
+    See get_named_beta_schedule() for the new library of schedules.
+    """
+    steps = num_diffusion_timesteps
+
+    def build():
+        if beta_schedule == "quad":
+            # linear in sqrt(beta), squared afterwards
+            return np.linspace(beta_start**0.5, beta_end**0.5, steps, dtype=np.float64) ** 2
+        if beta_schedule == "linear":
+            return np.linspace(beta_start, beta_end, steps, dtype=np.float64)
+        if beta_schedule == "warmup10":
+            return _warmup_beta_numpy(beta_start, beta_end, steps, 0.1)
+        if beta_schedule == "warmup50":
+            return _warmup_beta_numpy(beta_start, beta_end, steps, 0.5)
+        if beta_schedule == "const":
+            return beta_end * np.ones(steps, dtype=np.float64)
+        if beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
+            return 1.0 / np.linspace(steps, 1, steps, dtype=np.float64)
+        raise NotImplementedError(beta_schedule)
+
+    betas = build()
+    # Every schedule must yield exactly one beta per timestep.
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+
+
+def get_beta_schedule_torch(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
+    """
+    Torch counterpart of get_beta_schedule_numpy(); every branch returns a float64 torch.Tensor.
+    This is the deprecated API for creating beta schedules.
+    See get_named_beta_schedule() for the new library of schedules.
+    :param beta_schedule: one of "quad", "linear", "warmup10", "warmup50", "const", "jsd".
+    :param beta_start: first beta of the schedule (not used by "const" and "jsd").
+    :param beta_end: last beta of the schedule (not used by "jsd").
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :raises NotImplementedError: if beta_schedule is not one of the names above.
+    """
+    if beta_schedule == "quad":
+        # Linear in sqrt(beta), then squared. Built with torch so callers get a tensor, not an ndarray.
+        betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_diffusion_timesteps, dtype=torch.float64) ** 2
+    elif beta_schedule == "linear":
+        betas = torch.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=torch.float64)
+    elif beta_schedule == "warmup10":
+        betas = _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, 0.1)
+    elif beta_schedule == "warmup50":
+        betas = _warmup_beta_torch(beta_start, beta_end, num_diffusion_timesteps, 0.5)
+    elif beta_schedule == "const":
+        # dtype must be a torch dtype: torch.ones() rejects np.float64.
+        betas = beta_end * torch.ones(num_diffusion_timesteps, dtype=torch.float64)
+    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
+        betas = 1.0 / torch.linspace(num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=torch.float64)
+    else:
+        raise NotImplementedError(beta_schedule)
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+
+
+def test_get_beta_Schedule():
+    """Compare the NumPy and torch "linear" beta schedules (the only schedule exercised here)."""
+    # Same keyword arguments for both back ends so that only the array library differs.
+    common = dict(
+        beta_start=1e-6,
+        beta_end=0.99,
+        num_diffusion_timesteps=1000,
+    )
+    beta_schedule = "linear"
+    betas_np = get_beta_schedule_numpy(beta_schedule, **common)
+    betas_torch = get_beta_schedule_torch(beta_schedule, **common)
+    assert np.allclose(betas_np, betas_torch.numpy())
+    print("Test passed for get_beta_schedule()")
+
+
+# ====================
+# Replace alpha
+# ====================
+def betas_for_alpha_bar_numpy(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
+    """
+    Discretize alpha_bar(t), defined for t in [0, 1], into num_diffusion_timesteps betas.
+    Step i gets 1 - alpha_bar((i + 1) / T) / alpha_bar(i / T), capped at max_beta (T = number of steps).
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a callable that maps t in [0, 1] to the cumulative product
+                      of (1-beta) up to that part of the diffusion process.
+    :param max_beta: upper bound applied to every beta; use values lower than 1
+                     to prevent singularities.
+    :return: a NumPy array with one beta per timestep.
+    """
+    total = num_diffusion_timesteps
+    betas = [
+        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
+        for step in range(total)
+    ]
+    return np.array(betas)
+
+
+def betas_for_alpha_bar_torch(num_diffusion_timesteps: int, alpha_bar: Callable, max_beta: float = 0.999):
+    """
+    Discretize alpha_bar(t), defined for t in [0, 1], into a float64 tensor of betas.
+    Step i gets 1 - alpha_bar((i + 1) / T) / alpha_bar(i / T), capped at max_beta (T = number of steps).
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a callable that maps t in [0, 1] to the cumulative product
+                      of (1-beta) up to that part of the diffusion process.
+    :param max_beta: upper bound applied to every beta; use values lower than 1
+                     to prevent singularities.
+    :return: a torch.DoubleTensor with one beta per timestep.
+    """
+    total = num_diffusion_timesteps
+    betas = [
+        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
+        for step in range(total)
+    ]
+    return torch.DoubleTensor(betas)
+
+
+def test_betas_for_alpha_bar():
+    """NumPy and torch discretizations of alpha_bar(t) = 1 - t must agree."""
+    # Linearly decaying alpha_bar; the last beta would be 1 and is capped at 0.999.
+    args = (1000, lambda t: 1 - t, 0.999)
+    betas_np = betas_for_alpha_bar_numpy(*args)
+    betas_torch = betas_for_alpha_bar_torch(*args)
+    assert np.allclose(betas_np, betas_torch.numpy())
+    print("Test passed for betas_for_alpha_bar()")
+
+
+# =======================
+# Gaussian init
+# =======================
+def init_numpy(betas):
+    """
+    Compute a reduced set of Gaussian-diffusion coefficients with NumPy.
+
+    Performs the same arithmetic as gaussian_init_numpy() but returns only four of its arrays.
+
+    :param betas: 1-D sequence of betas, each in (0, 1].
+    :return: tuple (alphas_cumprod_prev, alphas_cumprod_next, posterior_mean_coef1,
+             posterior_mean_coef2) of float64 NumPy arrays with one entry per beta.
+    :raises AssertionError: if betas is not 1-D or has a value outside (0, 1].
+    """
+    # Use float64 for accuracy. Converted with NumPy (not torch) so that every
+    # intermediate below stays an ndarray.
+    betas = np.array(betas, dtype=np.float64)
+    assert len(betas.shape) == 1, "betas must be 1-D"
+    assert (betas > 0).all() and (betas <= 1).all()
+
+    num_timesteps = int(betas.shape[0])
+
+    alphas = 1.0 - betas
+    alphas_cumprod = np.cumprod(alphas, axis=0)
+    # Shift the cumulative product by one step in each direction: the "previous"
+    # series starts at 1.0 and the "next" series ends at 0.0.
+    alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
+    alphas_cumprod_next = np.append(alphas_cumprod[1:], 0.0)
+    assert alphas_cumprod_prev.shape == (num_timesteps,)
+
+    # calculations for posterior q(x_{t-1} | x_t, x_0)
+    # Only the two posterior-mean coefficients are part of the return value, so
+    # no other derived quantity is computed here.
+    posterior_mean_coef1 = betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+    posterior_mean_coef2 = (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
+
+    return alphas_cumprod_prev, alphas_cumprod_next, posterior_mean_coef1, posterior_mean_coef2
+
+
+def gaussian_init_numpy(betas):
+    """
+    Precompute the Gaussian-diffusion coefficient arrays for a beta schedule with NumPy.
+
+    :param betas: 1-D sequence of betas, each in (0, 1].
+    :return: 10-tuple of float64 arrays, in order: alphas_cumprod_prev, alphas_cumprod_next,
+             sqrt_alphas_cumprod, sqrt_one_minus_alphas_cumprod, log_one_minus_alphas_cumprod,
+             sqrt_recip_alphas_cumprod, sqrt_recipm1_alphas_cumprod, posterior_log_variance_clipped,
+             posterior_mean_coef1, posterior_mean_coef2.
+    """
+    # Use float64 for accuracy.
+    beta = np.array(betas, dtype=np.float64)
+    assert len(beta.shape) == 1, "betas must be 1-D"
+    assert (beta > 0).all() and (beta <= 1).all()
+    steps = int(beta.shape[0])
+
+    alpha = 1.0 - beta
+    cumprod = np.cumprod(alpha, axis=0)
+    cumprod_prev = np.append(1.0, cumprod[:-1])
+    cumprod_next = np.append(cumprod[1:], 0.0)
+    assert cumprod_prev.shape == (steps,)
+    one_minus_cumprod = 1.0 - cumprod
+
+    # calculations for posterior q(x_{t-1} | x_t, x_0)
+    variance = beta * (1.0 - cumprod_prev) / one_minus_cumprod
+    # log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+    if len(variance) > 1:
+        log_variance_clipped = np.log(np.append(variance[1], variance[1:]))
+    else:
+        log_variance_clipped = np.array([])
+    coef1 = beta * np.sqrt(cumprod_prev) / one_minus_cumprod
+    coef2 = (1.0 - cumprod_prev) * np.sqrt(alpha) / one_minus_cumprod
+
+    return (
+        cumprod_prev,
+        cumprod_next,
+        np.sqrt(cumprod),
+        np.sqrt(one_minus_cumprod),
+        np.log(one_minus_cumprod),
+        np.sqrt(1.0 / cumprod),
+        np.sqrt(1.0 / cumprod - 1),
+        log_variance_clipped,
+        coef1,
+        coef2,
+    )
+
+
+def gaussian_init_torch(betas):
+    """
+    Torch counterpart of gaussian_init_numpy(): precompute Gaussian-diffusion coefficients.
+
+    :param betas: 1-D sequence of betas, each in (0, 1].
+    :return: 10-tuple of float64 tensors in the same order as gaussian_init_numpy().
+    """
+    betas = torch.DoubleTensor(betas)  # float64 for accuracy
+    assert len(betas.shape) == 1, "betas must be 1-D"
+    assert (betas > 0).all() and (betas <= 1).all()
+    num_timesteps = int(betas.shape[0])
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    # new_ones/new_zeros inherit float64 from betas, so torch.cat never has to mix dtypes.
+    alphas_cumprod_prev = torch.cat([betas.new_ones(1), alphas_cumprod[:-1]])
+    alphas_cumprod_next = torch.cat([alphas_cumprod[1:], betas.new_zeros(1)])
+    assert alphas_cumprod_prev.shape == (num_timesteps,)
+
+    # calculations for diffusion q(x_t | x_{t-1}) and others
+    sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
+    sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
+    log_one_minus_alphas_cumprod = torch.log(1.0 - alphas_cumprod)
+    sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod)
+    sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod - 1)
+
+    # calculations for posterior q(x_{t-1} | x_t, x_0)
+    posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+    # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+    if len(posterior_variance) > 1:
+        posterior_log_variance_clipped = torch.log(torch.cat([posterior_variance[1].unsqueeze(0), posterior_variance[1:]]))
+    else:
+        posterior_log_variance_clipped = betas.new_empty(0)  # empty float64 tensor, mirrors np.array([])
+    posterior_mean_coef1 = betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+    posterior_mean_coef2 = (1.0 - alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - alphas_cumprod)
+    return (
+        alphas_cumprod_prev,
+        alphas_cumprod_next,
+        sqrt_alphas_cumprod,
+        sqrt_one_minus_alphas_cumprod,
+        log_one_minus_alphas_cumprod,
+        sqrt_recip_alphas_cumprod,
+        sqrt_recipm1_alphas_cumprod,
+        posterior_log_variance_clipped,
+        posterior_mean_coef1,
+        posterior_mean_coef2,
+    )
+
+
+def test_gaussian_init():  # every array from gaussian_init_numpy must match its gaussian_init_torch counterpart
+    betas = np.linspace(1e-6, 0.99, 1000)  # 1000 evenly spaced betas, all inside (0, 1]
+    (
+        alphas_cumprod_prev,
+        alphas_cumprod_next,
+        sqrt_alphas_cumprod,
+        sqrt_one_minus_alphas_cumprod,
+        log_one_minus_alphas_cumprod,
+        sqrt_recip_alphas_cumprod,
+        sqrt_recipm1_alphas_cumprod,
+        posterior_log_variance_clipped,
+        posterior_mean_coef1,
+        posterior_mean_coef2,
+    ) = gaussian_init_numpy(betas)
+    (  # torch results use the same names with a `_t` suffix
+        alphas_cumprod_prev_t,
+        alphas_cumprod_next_t,
+        sqrt_alphas_cumprod_t,
+        sqrt_one_minus_alphas_cumprod_t,
+        log_one_minus_alphas_cumprod_t,
+        sqrt_recip_alphas_cumprod_t,
+        sqrt_recipm1_alphas_cumprod_t,
+        posterior_log_variance_clipped_t,
+        posterior_mean_coef1_t,
+        posterior_mean_coef2_t,
+    ) = gaussian_init_torch(betas)
+
+    assert np.allclose(alphas_cumprod_prev, alphas_cumprod_prev_t.numpy())
+    assert np.allclose(alphas_cumprod_next, alphas_cumprod_next_t.numpy())
+    assert np.allclose(sqrt_alphas_cumprod, sqrt_alphas_cumprod_t.numpy())
+    assert np.allclose(sqrt_one_minus_alphas_cumprod, sqrt_one_minus_alphas_cumprod_t.numpy())
+    assert np.allclose(log_one_minus_alphas_cumprod, log_one_minus_alphas_cumprod_t.numpy())
+    assert np.allclose(sqrt_recip_alphas_cumprod, sqrt_recip_alphas_cumprod_t.numpy())
+    assert np.allclose(sqrt_recipm1_alphas_cumprod, sqrt_recipm1_alphas_cumprod_t.numpy())
+    assert np.allclose(posterior_log_variance_clipped, posterior_log_variance_clipped_t.numpy())
+    assert np.allclose(posterior_mean_coef1, posterior_mean_coef1_t.numpy())
+    assert np.allclose(posterior_mean_coef2, posterior_mean_coef2_t.numpy())
+    print("Test passed for gaussian_init()")
+
+
+if __name__ == "__main__":  # run all NumPy-vs-torch parity checks directly, without pytest
+    test_warmup_beta()
+    test_get_beta_Schedule()
+    test_betas_for_alpha_bar()
+    test_gaussian_init()
diff --git a/tests/test_pos_emb.py b/tests/test_pos_emb.py
new file mode 100644
index 0000000..c409acb
--- /dev/null
+++ b/tests/test_pos_emb.py
@@ -0,0 +1,45 @@
+import pytest
+import torch
+
+from opensora.models.layers.blocks import PositionEmbedding2D, get_2d_sincos_pos_embed
+
+D = 8
+SCALE = 2.0
+from torch.testing import assert_close
+
+
+def get_spatial_pos_embed(x, hidden_size, h, w, scale, base_size=None):
+    """Build the reference 2-D sin/cos embedding for an (h, w) grid on x's device and dtype."""
+    grid_size = (h, w)
+    table = get_2d_sincos_pos_embed(hidden_size, grid_size, scale=scale, base_size=base_size)
+    # NumPy array -> float32 tensor with an extra leading dim of size 1.
+    reference = torch.from_numpy(table).float().unsqueeze(0)
+    # The reference is a constant, so it is excluded from autograd.
+    reference = reference.requires_grad_(False)
+    return reference.to(device=x.device, dtype=x.dtype)
+
+
+@pytest.mark.parametrize("dtype", [torch.float, torch.float16])
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_pos_emb(dtype, device):  # PositionEmbedding2D must match the get_2d_sincos_pos_embed reference
+    # just a placeholder to get the device and dtype
+    x = torch.empty(1, dtype=dtype, device=device)
+    pos_embedder = PositionEmbedding2D(
+        D,
+        max_position_embeddings=8,
+        scale=SCALE,
+    ).to(device=device, dtype=dtype)
+    output = pos_embedder(x, 8, 7)  # grid no larger than max_position_embeddings
+    target = get_spatial_pos_embed(x, D, 8, 7, SCALE)
+    assert_close(output, target)
+    output = pos_embedder(x, 15, 16)  # grid larger than max_position_embeddings
+    target = get_spatial_pos_embed(x, D, 15, 16, SCALE)
+    assert_close(output, target)
+    output = pos_embedder(x, 30, 20, base_size=2)  # explicit base_size
+    target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2)
+    assert_close(output, target)
+    # test cache: repeat the previous call and expect at least one cache hit
+    output = pos_embedder(x, 30, 20, base_size=2)
+    target = get_spatial_pos_embed(x, D, 30, 20, SCALE, base_size=2)
+    assert_close(output, target)
+    assert pos_embedder._get_cached_emb.cache_info().hits >= 1
diff --git a/tests/test_seq_parallel_attention.py b/tests/test_seq_parallel_attention.py
new file mode 100644
index 0000000..1a06eba
--- /dev/null
+++ b/tests/test_seq_parallel_attention.py
@@ -0,0 +1,189 @@
+import colossalai
+import torch
+import torch.distributed as dist
+from colossalai.testing import spawn
+
+from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
+from opensora.acceleration.parallel_states import set_sequence_parallel_group
+from opensora.models.layers.blocks import (
+    Attention,
+    MultiHeadCrossAttention,
+    SeqParallelAttention,
+    SeqParallelMultiHeadCrossAttention,
+)
+
+
+def run_attention(rank, world_size, kernel_size=None, temporal=False):  # compares SeqParallelAttention with Attention
+    # create model
+    torch.manual_seed(1024)
+    set_sequence_parallel_group(dist.group.WORLD)
+
+    seq_parallel_attention = SeqParallelAttention(
+        dim=1152, num_heads=16, qkv_bias=True, enable_flash_attn=False, kernel_size=kernel_size, temporal=temporal
+    ).cuda()
+    # The seed is reset before the reference module, presumably so both modules get the same initial weights.
+
+    torch.manual_seed(1024)
+    attention = Attention(
+        dim=1152,
+        num_heads=16,
+        qkv_bias=True,
+        enable_flash_attn=False,
+        kernel_size=kernel_size,
+    ).cuda()
+
+    # create inputs
+    torch.manual_seed(1024)
+    # 5-D input when a kernel_size is given, otherwise a [3, 16 * 8, 1152] sequence
+    if kernel_size:
+        x = torch.randn(3, 12, 40, 30, 1152).cuda()
+    else:
+        x = torch.randn(3, 16 * 8, 1152).cuda()
+    seq_x = x.clone().detach()  # independent copy, so each path accumulates its own input gradient
+
+    x.requires_grad = True
+    x.retain_grad()
+    seq_x.requires_grad = True
+    seq_x.retain_grad()
+
+    if kernel_size is None and temporal is True:
+        from einops import rearrange
+
+        seq_x_ = rearrange(seq_x, "B (T S) C -> B T S C", T=16, S=8)
+        sub_seq_x = split_forward_gather_backward(seq_x_, dist.group.WORLD, dim=2, grad_scale="down")  # split S
+        sub_seq_x = rearrange(sub_seq_x, "B T S C -> (B S) T C", T=16, S=8 // world_size)
+
+        sub_seq_out = seq_parallel_attention(sub_seq_x)
+
+        sub_seq_out = rearrange(sub_seq_out, "(B S) T C -> B T S C", T=16, S=8 // world_size)
+        seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=2, grad_scale="up")
+        seq_out = rearrange(seq_out, "B T S C -> B (T S) C", T=16, S=8)
+
+        x_ = rearrange(x, "B (T S) C -> (B S) T C", T=16, S=8)  # reference path: attention along T only
+        out = attention(x_)
+        out = rearrange(out, "(B S) T C -> B (T S) C", T=16, S=8)
+
+    else:
+        sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
+        sub_seq_out = seq_parallel_attention(sub_seq_x)
+        seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
+
+        # run model
+        out = attention(x)
+    seq_out = seq_out.view(out.shape)
+
+    assert torch.allclose(seq_out, out, atol=1e-6), f"{seq_out.view(-1)[:10]}\nvs\n{out.view(-1)[:10]}"
+
+    # run backward
+    seq_out.mean().backward()
+    out.mean().backward()
+
+    # all reduce gradient for sp: sum over ranks, then divide by world_size (i.e. average)
+    for p in seq_parallel_attention.parameters():
+        if p.grad is not None:
+            dist.all_reduce(p.grad, group=dist.group.WORLD)
+            p.grad.div_(world_size)
+
+    # check grad
+    for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
+        assert torch.allclose(p1.grad, p2.grad, atol=1e-7), f"{p1.grad}\nvs\n{p2.grad}"
+
+    # check input grad
+    assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
+
+
+def run_cross_attention(rank, world_size):  # compares SeqParallelMultiHeadCrossAttention with MultiHeadCrossAttention
+    # create model
+    torch.manual_seed(1024)
+    set_sequence_parallel_group(dist.group.WORLD)
+    seq_parallel_attention = (
+        SeqParallelMultiHeadCrossAttention(
+            d_model=256,
+            num_heads=4,
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
+
+    torch.manual_seed(1024)
+    attention = (
+        MultiHeadCrossAttention(
+            d_model=256,
+            num_heads=4,
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
+
+    # make sure the weights are the same
+    for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
+        p1.data.copy_(p2.data)
+
+    # create inputs
+    torch.manual_seed(1024)
+    x = torch.randn(4, 64, 256).cuda().to(torch.bfloat16)
+    y = torch.randn(4, 32, 256).cuda().to(torch.bfloat16)
+
+    mask = [2, 10, 8, 16]  # overwritten by the next line, so the test currently runs without a mask
+    mask = None
+    seq_x = x.clone().detach()
+    seq_y = y.clone().detach()
+
+    # set grad
+    x.requires_grad = True
+    x.retain_grad()
+    seq_x.requires_grad = True
+    seq_x.retain_grad()
+    y.requires_grad = True
+    y.retain_grad()
+    seq_y.requires_grad = True
+    seq_y.retain_grad()
+
+    # split by sequence (only x is split; seq_y is passed to the module whole)
+    sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
+
+    # run model
+    out = attention(x, y, mask)
+    sub_seq_out = seq_parallel_attention(sub_seq_x, seq_y, mask)
+    seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
+
+    assert torch.allclose(seq_out, out, rtol=1e-5, atol=1e-6), f"\n{seq_out}\nvs\n{out}"
+
+    # run backward
+    seq_out.mean().backward()
+    out.mean().backward()
+
+    # all reduce gradient for sp: sum over ranks, then divide by world_size (i.e. average)
+    for name, p in seq_parallel_attention.named_parameters():
+        if p.grad is not None:
+            dist.all_reduce(p.grad, group=dist.group.WORLD)
+            p.grad.div_(world_size)
+        else:
+            print(f"grad of {name} is None")
+
+    # check grad
+    for p1, p2 in zip(seq_parallel_attention.named_parameters(), attention.named_parameters()):
+        assert torch.allclose(
+            p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4
+        ), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
+
+    # check input grad
+    assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
+    assert torch.allclose(y.grad, seq_y.grad, atol=1e-7), f"{y.grad}\nvs\n{seq_y.grad}"
+
+
+def run_dist(rank, world_size, port):  # per-process entry point handed to spawn() by the test below
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port)
+    run_attention(rank, world_size, temporal=True)
+    run_attention(rank, world_size, temporal=False)
+    run_attention(rank, world_size, kernel_size=(8, 8, -1), temporal=True)
+    run_attention(rank, world_size, kernel_size=(8, 8, -1), temporal=False)
+    # run_cross_attention(rank, world_size)  # cross-attention check is currently disabled
+
+
+def test_seq_parallel_attention():  # runs run_dist in 4 spawned processes
+    spawn(run_dist, nprocs=4)
+
+
+if __name__ == "__main__":  # allow running this test directly, without pytest
+    test_seq_parallel_attention()
diff --git a/tests/test_stdit3_sequence_parallelism.py b/tests/test_stdit3_sequence_parallelism.py
new file mode 100644
index 0000000..70786f4
--- /dev/null
+++ b/tests/test_stdit3_sequence_parallelism.py
@@ -0,0 +1,109 @@
+import colossalai
+import torch
+import torch.distributed as dist
+from colossalai.testing import spawn
+from colossalai.utils.common import set_seed
+
+from opensora.acceleration.parallel_states import set_sequence_parallel_group
+from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config
+
+
+def get_sample_data():
+    """Build one batch of random STDiT3 forward kwargs (the caller moves the tensors to the GPU)."""
+    bf16 = torch.bfloat16
+    # x is drawn before y; keep this order so seeded runs stay reproducible
+    x = torch.rand([1, 4, 15, 20, 28], dtype=bf16)  # (B, C, T, H, W)
+    y = torch.rand(1, 1, 300, 4096, dtype=bf16)
+    timestep, fps, height, width = (torch.Tensor([v]).to(bf16) for v in (924.0, 25.0, 166.0, 221.0))
+    mask = torch.ones([1, 300], dtype=torch.int32)
+    x_mask = torch.ones([1, 15]).bool()
+    return dict(x=x, timestep=timestep, y=y, mask=mask, x_mask=x_mask, fps=fps, height=height, width=width)
+
+
+def get_stdit3_config(enable_sequence_parallelism=False):
+    """Return the depth-1 STDiT3Config used by this test; only the sequence-parallel flag is variable."""
+    return STDiT3Config(
+        caption_channels=4096,
+        class_dropout_prob=0.0,
+        depth=1,
+        drop_path=0.0,
+        enable_flash_attn=True,
+        enable_layernorm_kernel=True,
+        enable_sequence_parallelism=enable_sequence_parallelism,
+        freeze_y_embedder=True,
+        hidden_size=1152,
+        in_channels=4,
+        input_size=[None, None, None],
+        input_sq_size=512,
+        mlp_ratio=4.0,
+        model_max_length=300,
+        model_type="STDiT3",
+        num_heads=16,
+        only_train_temporal=False,
+        patch_size=[1, 2, 2],
+        pred_sigma=True,
+        qk_norm=True,
+        skip_y_embedder=False,
+    )
+
+
+def run_model(rank, world_size, port):
+    """Worker: run STDiT3 with and without sequence parallelism on identical weights/inputs and compare."""
+    colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost")
+
+    # prepare data
+    data = {k: v.cuda() for k, v in get_sample_data().items()}
+
+    # test single-gpu output
+    set_seed(1024)
+    non_dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=False)
+    non_dist_model = STDiT3(non_dist_model_cfg).cuda().to(torch.bfloat16)
+    non_dist_out = non_dist_model(**data)
+    non_dist_out.mean().backward()
+
+    # run seq parallelism (same seed, so both models are initialised identically)
+    set_sequence_parallel_group(dist.group.WORLD)
+    set_seed(1024)
+    dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True)
+    dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16)
+
+    # ensure model weights are equal
+    for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
+        assert torch.equal(p1, p2)
+
+    # ensure model weights are equal across all ranks
+    for p in dist_model.parameters():
+        p_list = [torch.zeros_like(p) for _ in range(world_size)]
+        dist.all_gather(p_list, p, group=dist.group.WORLD)
+        assert all(torch.equal(p_list[0], other) for other in p_list[1:])  # valid for any world_size
+
+    dist_out = dist_model(**data)
+    dist_out.mean().backward()
+
+    # run all reduce for gradients (sum over ranks, then average)
+    for param in dist_model.parameters():
+        if param.grad is not None:
+            dist.all_reduce(param.grad, group=dist.group.WORLD)
+            param.grad /= world_size
+
+    # ensure model weights are equal
+    for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
+        assert torch.equal(p1, p2)
+
+    # check: outputs must match; gradient mismatches are only printed (on rank 0), not asserted
+    torch.testing.assert_close(non_dist_out, dist_out)
+    for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()):
+        assert n1 == n2
+        if p1.grad is not None and p2.grad is not None:
+            if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0:
+                print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
+        else:
+            assert p1.grad is None and p2.grad is None
+
+
+def test_stdit3_sp():  # test entry point
+    spawn(run_model, 2)  # presumably 2 is the number of processes (positional nprocs) - TODO confirm
+
+
+if __name__ == "__main__":
+    test_stdit3_sp()
diff --git a/tests/test_t5_shardformer.py b/tests/test_t5_shardformer.py
new file mode 100644
index 0000000..68040ab
--- /dev/null
+++ b/tests/test_t5_shardformer.py
@@ -0,0 +1,71 @@
+import time
+from copy import deepcopy
+
+import colossalai
+import torch
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.testing import spawn
+
+from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
+from opensora.models.text_encoder.t5 import T5Embedder
+
+
+def run_t5_encoder(rank, world_size, port):
+    """Worker: check that the ShardFormer-optimized T5 embedder matches the original one, then time both."""
+    colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost")
+
+    # t5 embedder: hf_t5 stays untouched, sf_t5 is the copy handed to ShardFormer
+    t5_path = "./pretrained_models/t5_ckpts"
+    hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float)
+    sf_t5 = deepcopy(hf_t5)
+
+    # optimize the copied model with ShardFormer (only jit fusion is enabled)
+    shard_config = ShardConfig(
+        tensor_parallel_process_group=None,
+        pipeline_stage_manager=None,
+        enable_tensor_parallelism=False,
+        enable_fused_normalization=False,
+        enable_flash_attention=False,
+        enable_jit_fused=True,
+        enable_sequence_parallelism=False,
+        enable_sequence_overlap=False,
+    )
+    shard_former = ShardFormer(shard_config=shard_config)
+    sf_t5.model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy())
+
+    # test t5 embedder (repeated calls; the results of the last one are compared)
+    texts = ["Who is the best player in the history of NBA?", "How to study computer science?"]
+    for _ in range(5):
+        hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
+        sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
+
+    # check accuracy
+    assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}"
+    assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}"
+
+    # measure perf
+    torch.cuda.synchronize()
+    hf_start = time.time()
+    for _ in range(20):
+        hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
+    torch.cuda.synchronize()
+    hf_end = time.time()
+
+    # convert sf to fp16 and time the ShardFormer model itself (not hf_t5 a second time)
+    sf_t5.model = sf_t5.model.half()
+    torch.cuda.synchronize()
+    sf_start = time.time()
+    for _ in range(20):
+        sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
+    torch.cuda.synchronize()
+    sf_end = time.time()
+
+    print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s")
+
+
+def test_t5_encoder():  # test entry point
+    spawn(run_t5_encoder)  # no process count given: spawn's default applies
+
+
+if __name__ == "__main__":
+    test_t5_encoder()
diff --git a/tests/test_tiled_conv3d.py b/tests/test_tiled_conv3d.py
new file mode 100644
index 0000000..852810e
--- /dev/null
+++ b/tests/test_tiled_conv3d.py
@@ -0,0 +1,52 @@
+import torch
+
+from opensora.models.layers.tiled_conv3d import TiledConv3d
+
+torch.backends.cudnn.benchmark = True
+torch.backends.cudnn.deterministic = True
+
+
+def test_tiled_conv3d():
+    """Compare TiledConv3d (built via from_native_conv3d) with its source Conv3d over all option combinations."""
+    data = torch.rand(1, 128, 51, 256, 256).cuda().to(torch.bfloat16)
+
+    exclude_temporal_dim_options = [True, False]
+    padding_options = [(0, 0, 0), (0, 1, 1), (1, 1, 1)]
+    stride_options = [(1, 1, 1), (2, 1, 1)]
+    kernel_size_options = [(1, 1, 1), (3, 3, 3)]
+    tile_size = 16
+    failures = []  # mismatching configurations, reported together at the end
+
+    for padding in padding_options:
+        for stride in stride_options:
+            for kernel_size in kernel_size_options:
+                for exclude_temporal_dim in exclude_temporal_dim_options:
+                    conv3d = torch.nn.Conv3d(128, 128, kernel_size=kernel_size, stride=stride, padding=padding)
+                    conv3d = conv3d.cuda().to(torch.bfloat16)
+                    auto_tiled_conv3d = TiledConv3d.from_native_conv3d(
+                        conv3d, tile_size=tile_size, exclude_temporal_dim=exclude_temporal_dim
+                    )
+
+                    # compare
+                    with torch.inference_mode():
+                        out = conv3d(data)
+                        merged_out = auto_tiled_conv3d(data)
+
+                    print(f"max allocated: {torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024:.2f} GB")
+                    print(f"max reserved: {torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024:.2f} GB")
+
+                    try:
+                        torch.testing.assert_close(out, merged_out)
+                    except AssertionError as e:
+                        msg = (
+                            f"Failed with padding={padding}, stride={stride}, kernel_size={kernel_size}, exclude_temporal_dim={exclude_temporal_dim}, error: {e}"
+                        )
+                        print(msg)
+                        failures.append(msg)
+
+    # a mismatch must fail the test instead of only being printed
+    assert not failures, "\n".join(failures)
+
+
+if __name__ == "__main__":
+    test_tiled_conv3d()
diff --git a/tests/test_video_io.py b/tests/test_video_io.py
new file mode 100644
index 0000000..73c3ecd
--- /dev/null
+++ b/tests/test_video_io.py
@@ -0,0 +1,163 @@
+import os
+
+import cv2
+import numpy as np
+import torch
+from mmengine.runner import set_random_seed
+from tqdm import tqdm
+
+from opensora.datasets import video_transforms
+from opensora.datasets.utils import read_file
+from opensora.utils.misc import to_torch_dtype
+
+# data_path = "~/data/issue.csv"
+data_path = "~/data/test.csv"  # file loaded with read_file; its "path" entries are iterated below
+save_dir = "samples/debug_original_video_read_write"  # output directory for the re-written clips
+num_frames = 17  # number of frames sampled from each video
+frame_interval = 1  # multiplied with num_frames to size the temporal crop window
+image_size = 1024  # NOTE(review): only referenced by commented-out code below
+
+set_random_seed(1024)
+os.makedirs(save_dir, exist_ok=True)
+data = read_file(data_path)  # NOTE: runs at import time
+
+device = "cuda" if torch.cuda.is_available() else "cpu"  # NOTE(review): only used by commented-out code
+dtype = to_torch_dtype("bf16")  # NOTE(review): only used by commented-out code
+
+
+def temporal_random_crop(vframes, num_frames, frame_interval):
+    """Sample `num_frames` evenly spaced frames from the span returned by `video_transforms.TemporalRandomCrop`."""
+    # the crop transform is asked for a window of num_frames * frame_interval frames
+    cropper = video_transforms.TemporalRandomCrop(num_frames * frame_interval)
+    begin, end = cropper(len(vframes))
+    assert end - begin >= num_frames, f"Not enough frames to sample, {end} - {begin} < {num_frames}"
+
+    # indices run from begin to end - 1 inclusive
+    picked = np.linspace(begin, end - 1, num_frames, dtype=int)
+    return vframes[picked]
+
+
+def to_tensor(clip):
+    """
+    Convert a uint8 clip to float and divide its values by 255.0; the dimension order is left unchanged.
+    Args:
+        clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
+    Return:
+        clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
+    Raises:
+        TypeError: if `clip` does not have dtype torch.uint8
+    """
+    if clip.dtype != torch.uint8:
+        raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
+    return clip.to(torch.float32).div(255.0)
+
+
+def read_video_cv2(video_path):
+    """
+    Decode every frame of a video file with OpenCV.
+
+    Args:
+        video_path (str): path handed to cv2.VideoCapture
+    Return:
+        frames (torch.tensor): RGB frames, Size is (T, C=3, H, W)
+        vinfo (dict): {"video_fps": value of cv2.CAP_PROP_FPS}
+    Raises:
+        ValueError: if the video cannot be opened or no frame could be read
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise ValueError(f"Cannot open video: {video_path}")
+
+    # No cv2.waitKey() / cv2.destroyAllWindows() here: nothing is displayed, so there is no
+    # key press to poll, and these GUI calls may be unavailable in headless OpenCV builds.
+    frames = []
+    try:
+        vinfo = {"video_fps": cap.get(cv2.CAP_PROP_FPS)}
+        while True:
+            ret, frame = cap.read()
+            if not ret:  # no further frame could be read
+                break
+            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # BGR to RGB
+    finally:
+        # always release the capture handle, even if decoding raises
+        cap.release()
+
+    if not frames:
+        raise ValueError(f"No frames decoded from video: {video_path}")
+    frames = torch.from_numpy(np.stack(frames))  # [T, H, W, C=3]
+    return frames.permute(0, 3, 1, 2), vinfo
+
+
+def write_video_cv2(path, video, fps=24, image_size=None):
+    """Write a (T, C, H, W) RGB clip to `path` with the mp4v codec; `image_size` is (W, H), default: taken from `video`."""
+    # OpenCV requires written frames to have the size the writer was opened with, so derive it from the clip
+    frame_size = (video.size(3), video.size(2)) if image_size is None else image_size
+    output = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*"mp4v"), fps, frame_size)
+    try:
+        for frame in video:
+            rgb = np.array(frame.permute(1, 2, 0))  # CHW -> HWC
+            output.write(cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))
+    finally:
+        output.release()  # finalize the file even if a frame fails to convert
+
+
+for video_path in tqdm(data["path"]):  # read, temporally sample and re-write each listed video
+    name = os.path.basename(video_path)
+
+    # # DEBUG: read image and save as if video: no issue
+    # image = cv2.imread('/home/shenchenhui/data/ship-in-coffee-image.png')
+    # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # image = torch.Tensor(image)
+    # fake_vid = image.repeat(48, 1, 1, 1)
+    # write_video(f"{save_dir}/fake.mp4", fake_vid, fps=24, video_codec="h264")
+
+    # # ===== data loading ====== #
+    # # loading
+    # vframes, vinfo = read_video(video_path, backend="cv2")
+    vframes, vinfo = read_video_cv2(video_path)
+    video_fps = vinfo["video_fps"] if "video_fps" in vinfo else 24
+    print("fps:", video_fps)
+    # # Sampling video frames
+    video = vframes  # immediately overwritten by the crop on the next line
+    video = temporal_random_crop(vframes, num_frames, frame_interval)  # not this issue
+    # # breakpoint()
+
+    # video = to_tensor(video)
+
+    # # # transform
+    # # transform_video = transforms.Compose(
+    # #     [
+    # #         # video_transforms.ToTensorVideo(),  # moved up
+    # #         # video_transforms.UCFCenterCropVideo(image_size), # not this issue
+    # #         transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), # not this issues
+    # #     ]
+    # # )
+    # # video = transform_video(video)  # T C H W
+
+    ### write each frame as image
+    # for frame_idx in range(video.size(0)):
+    #     frame = np.array(video[frame_idx].permute(1,2,0))
+    #     frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+    #     cv2.imwrite(f'{save_dir}/{frame_idx}.jpg', frame)
+
+    # # # TCHW -> CTHW
+    # video = video.permute(1, 0, 2, 3)
+
+    # # # # ===== model training ====== #
+    # # # video = video.to(device, dtype)
+
+    # # # ===== data saving ====== #
+    # # # # Normalize
+    # # # low, high = -1,1
+    # # # video.clamp_(min=low, max=high)
+    # # # video.sub_(low).div_(max(high - low, 1e-5))
+
+    # # # video = video.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
+    # # # breakpoint()
+    # # # video = video.permute(1, 2, 3, 0).to("cpu", torch.uint8)
+    # video = video.permute(1, 2, 3, 0)
+
+    # write the sampled frames with OpenCV; image_size is not passed, so write_video_cv2's default applies
+    write_video_cv2(f"{save_dir}/{name}", video, fps=video_fps)
+    # # prep to [T, H, W, C] in order to write
+    # write_video(f"{save_dir}/{name}", video, fps=video_fps, video_codec="h264")
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/caption/README.md b/tools/caption/README.md
new file mode 100644
index 0000000..0e51b71
--- /dev/null
+++ b/tools/caption/README.md
@@ -0,0 +1,253 @@
+# Video Captioning
+
+Human labeling of videos is expensive and time-consuming. We adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves a better performance, its 20s/sample speed is too slow for us. As for our v1.2 model, we captioned our training videos with the [PLLaVA](https://github.com/magic-research/PLLaVA) model. PLLaVA performs highly competitively on multiple video-based text generation benchmarks including [MVbench](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=pllava-parameter-free-llava-extension-from-1).
+
+## PLLaVA Captioning
+
+To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offloading frame extraction to a separate process so that GPU computation and frame extraction happen in parallel.
+
+### Installation
+Install the required dependencies by following the "Data Dependencies" and "PLLaVA Captioning" sections of our [installation instructions](../../docs/installation.md).
+
+
+<!-- ### Download the PLLaVA repo
+
+First, make sure you are under the directory of tools/caption/pllava_dir. Then,
+
+```bash
+git clone https://github.com/magic-research/PLLaVA.git
+cd PLLaVA
+git checkout fd9194a
+
+
+```
+
+### Environment
+
+```bash
+conda create -n pllava python=3.10
+
+conda activate pllava
+
+pip install -r requirements.txt # change to your own torch version if neccessary; torch==2.2.2, torchaudio==2.2.2, torchvision==0.17.2 worked for H100 for Tom.
+
+```
+
+
+### Download weights
+
+```bash
+python python_scripts/hf.py # download the weights
+``` -->
+### Usage
+
+Since PLLaVA is not fashioned as a package, we will use PYTHONPATH to use it.
+
+
+```bash
+cd .. # step back to pllava_dir
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+PYTHONPATH="$PYTHONPATH:OPEN_SORA_HOME/tools/caption/pllava_dir/PLLaVA" \
+nohup torchrun --nproc_per_node 8 --standalone caption_pllava.py \
+  --pretrained_model_name_or_path PLLaVA/MODELS/pllava-13b \
+  --use_lora \
+  --lora_alpha 4 \
+  --num_frames 4 \
+  --weight_dir PLLaVA/MODELS/pllava-13b \
+  --csv_path meta.csv \
+  --pooling_shape 4-12-12 \
+  > pllava_caption.out 2>&1 &
+```
+
+### PLLaVA vs. LLaVA
+In our previous releases, we used [LLaVA](#llava-captioning) for video captioning.
+Qualitatively speaking, we observe that PLLaVA has a somewhat higher chance of accurately capturing the details in the video than LLaVA. See below for their comparison on a video sample.
+<!-- <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/llava_vs_pllava_sample.gif" width="300" height="200" alt="LLaVA vs PLLaVA"> -->
+
+<figure>
+    <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/llava_vs_pllava_sample.gif" width="300" height="200" alt="LLaVA vs PLLaVA">
+</figure>
+
+
+
+     | LLaVA                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            | PLLaVA                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+-----|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----
+     | The video is a close-up shot of two gold wedding rings. The rings are placed on a (black surface)✅, casting a soft shadow beneath them. The rings are positioned in such a way that (they are facing each other)❌, creating a mirror image effect. The rings are (identical in size and design)✅, suggesting they are a pair. The lighting in the video is soft and diffused, highlighting the gold color of the rings and creating a warm and inviting atmosphere. The overall style of the video is minimalist and elegant, focusing solely on the rings and their reflection. | The video shows a pair of gold wedding rings on a (reflective surface)✅. The rings are placed one on top of the other, (with the top ring slightly tilted to the left)✅. The rings have a (shiny, metallic finish)✅ and are the main focus of the image. The background is a gradient of dark to light gray, providing a neutral backdrop that highlights the rings. There are no texts or other objects in the image. The style of the video is a simple product display with a focus on the rings, likely intended for promotional or sales purposes. The lighting and shadows suggest a soft, even light source, (possibly a studio light)✅, which creates a reflective surface beneath the rings. |
+<!-- | Row2Cell1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        | Row2Cell2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | -->
+
+
+
+
+## LLaVA Captioning
+
+We extract three frames from the video for captioning. With batch inference, we can achieve a 10x speedup. With approximately 720p resolution and 1 frame, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can reach 8 videos/s. In Open-Sora v1.1, to lower the cost, we use the 7B model.
+
+### Installation
+
+Install the required dependencies by following the "Data Dependencies" and "LLaVA Captioning" sections of our [installation instructions](../../docs/installation.md).
+
+<!-- ### Requirement
+
+```bash
+# create conda env
+conda create -n llava python=3.10 -y
+conda activate llava
+
+# install torch
+pip install torch torchvision
+
+# clone llava
+git clone https://github.com/haotian-liu/LLaVA.git
+cd LLaVA
+# CAUTION: This line is to remove torch dependency in pyproject.toml, which is:
+# "torch==2.1.2", "torchvision==0.16.2",
+# It is better manually remove it in your local pyproject.toml
+sed -i '16d' pyproject.toml
+
+# install llava
+pip install --upgrade pip  # enable PEP 660 support
+pip install -e .
+
+# install flash attention
+pip install flash-attn --no-build-isolation
+# install colossalai and decord
+pip install colossalai decord
+``` -->
+
+### Usage
+
+Prepare a csv file for processing. The csv file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md). Then, run the following command to generate captions for videos/images with Llava:
+
+```bash
+# caption with mistral-7B
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
+
+# caption with llava-34B
+# NOTE: remember to enable flash attention for this model
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention
+
+# we run this on 8xH800 GPUs
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 4 --bs 16
+
+# at least two 80G GPUs are required
+torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16
+
+# can also caption images
+torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 --prompt image-3ex
+```
+
+Please note that you should add the `--flash-attention` flag when running with Llama-based Llava models as it provides speedup but do turn it off for mistral-based ones. Reasons can be found in [this issue](https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453).
+
+After running the script, with `dp-size=N`, you will get `N` parts of csv files. Run the following command to merge them:
+
+```bash
+python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
+```
+
+### Resume
+
+Sometimes the process may be interrupted. We can resume the process by running the following command:
+
+```bash
+# merge generated results
+python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
+
+# get the remaining videos
+python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
+```
+
+Then use the output csv file to resume the process.
+
+
+## GPT-4V Captioning
+
+Run the following command to generate captions for videos with GPT-4V:
+
+```bash
+# output: DATA_caption.csv
+python -m tools.caption.caption_gpt4 DATA.csv --key $OPENAI_API_KEY
+```
+
+The cost is approximately $0.01 per video (3 frames per video).
+
+## Camera Motion Detection
+
+<!-- Install additional required packages: `tools/caption/camera_motion/requirements.txt`. -->
+Install required packages with `pip install -v .[data]` (See [installation.md](../../docs/installation.md)).
+Run the following command to classify camera motion:
+
+```bash
+# output: meta_cmotion.csv
+python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv
+```
+
+You may additionally specify `threshold` to indicate how "sensitive" the detection should be as below. For example `threshold = 0.2` means that the video is only counted as `tilt_up` when the pixels moved down by `>20%` of video height between the starting and ending frames.
+```bash
+# output: meta_cmotion.csv
+python -m tools.caption.camera_motion.detect tools/caption/camera_motion/meta.csv --threshold 0.2
+```
+
+Each video is classified according to 8 categories:
+            `pan_right,
+            pan_left,
+            tilt_up,
+            tilt_down,
+            zoom_in,
+            zoom_out,
+            static,
+            unclassified`.
+Categories of `tilt`, `pan` and `zoom` can overlap with each other.
+
+
+## Tagging with Llama3
+
+To understand the overall category distribution of our training dataset, we use Llama3 to generate tags based on the video captions.
+
+After obtaining Llama3 usage permission from huggingface/meta, you may generate tags based on the captions using Llama3 like this:
+
+```bash
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llama3 meta.csv --key objects --output_prefix meta
+```
+
+This will generate tags based on the `text` column of `meta.csv` and put the results to `output_prefix + key.csv`. Currently the prompts for `objects` and `actions` are supported.
+
+
+
+## LLaVA-Next Captioning
+LLaVA-NeXT-Video-32B-Qwen, based on Qwen-32B, improves captioning accuracy and linguistic diversity through enhanced language understanding and better visual-text alignment. Using 32 video frames, it captures detailed spatial information for more contextually accurate captions. To optimize efficiency, we batch frames for GPU utilization and employ multi-threaded frame extraction to run in parallel with GPU computations, preventing bottlenecks. Loading in 8-bits is currently buggy and needs to be fixed.
+
+### Installation
+
+You need to install LLaVA-NeXT:
+
+```
+# install LLaVA-NeXT
+git clone https://github.com/LLaVA-VL/LLaVA-NeXT
+cd LLaVA-NeXT
+conda create -n llava python=3.10 -y
+conda activate llava
+pip install --upgrade pip  # Enable PEP 660 support.
+pip install -e ".[train]"
+
+# we use fixed transformers version
+pip install transformers==4.40.0
+```
+
+### Usage
+
+The script takes one csv dataset file and launches `WORLD_SIZE` processes, each handling one split of the captioning jobs.
+
+```
+CUDA_VISIBLE_DEVICES=1,2,3,4 python3 caption_llava_next.py \
+    --model-path /PATH/TO/LLaVA-NeXT-Video-32B-Qwen \
+    --data_file /PATH/TO/data.csv \
+    --output_folder /PATH/TO/output_dir \
+    --overwrite true \
+    --mm_spatial_pool_stride 2 \
+    --for_get_frames_num 32 \
+    --conv-mode qwen_2 \
+    --mm_spatial_pool_mode average \
+    --mm_newline_position grid \
+    --prompt "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes."
+```
diff --git a/tools/caption/__init__.py b/tools/caption/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/caption/acceleration/__init__.py b/tools/caption/acceleration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/caption/acceleration/llava/__init__.py b/tools/caption/acceleration/llava/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/caption/acceleration/llava/policies/__init__.py b/tools/caption/acceleration/llava/policies/__init__.py
new file mode 100644
index 0000000..35998d4
--- /dev/null
+++ b/tools/caption/acceleration/llava/policies/__init__.py
@@ -0,0 +1,2 @@
+from .llama import LlavaLlamaForCausalLMPolicy
+from .mistral import LlavaMistralForCausalLMPolicy
diff --git a/tools/caption/acceleration/llava/policies/llama.py b/tools/caption/acceleration/llava/policies/llama.py
new file mode 100644
index 0000000..dff8f01
--- /dev/null
+++ b/tools/caption/acceleration/llava/policies/llama.py
@@ -0,0 +1,98 @@
+from typing import Dict, Union
+
+import torch.nn as nn
+from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ["LlavaLlamaPolicy", "LlavaLlamaForCausalLMPolicy"]
+
+
+class LlavaLlamaPolicy(Policy):
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        if self.shard_config.enable_tensor_parallelism:
+            # Resize embedding
+            self.model.config.vocab_size
+            self.shard_config.tensor_parallel_size
+
+            # if vocab_size % world_size != 0:
+            #     new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            #     self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
+                    self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size
+                )
+
+            policy[LlamaDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.up_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.down_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                ],
+            )
+
+        return policy
+
+    def postprocess(self):
+        return self.model
+
+
+class LlavaLlamaForCausalLMPolicy(LlavaLlamaPolicy):  # Parent decoder-layer policy plus an lm_head replacement.
+    def module_policy(self):  # Returns the parent's policy dict, extended under tensor parallelism.
+        from transformers import LlamaForCausalLM
+
+        policy = super().module_policy()
+        if self.shard_config.enable_tensor_parallelism:
+            # Add an entry for the causal LM: lm_head is replaced by Linear1D_Col with gather_output=True.
+            new_item = {
+                LlamaForCausalLM: ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
+                        )
+                    ],
+                )
+            }
+            policy.update(new_item)
+        return policy
diff --git a/tools/caption/acceleration/llava/policies/mistral.py b/tools/caption/acceleration/llava/policies/mistral.py
new file mode 100644
index 0000000..0afea57
--- /dev/null
+++ b/tools/caption/acceleration/llava/policies/mistral.py
@@ -0,0 +1,113 @@
+import warnings
+from typing import Dict, Union
+
+import torch.nn as nn
+from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ["LlavaMistralPolicy", "LlavaMistralForCausalLMPolicy"]
+
+
+class LlavaMistralPolicy(Policy):  # Replacement policy for MistralDecoderLayer projections and MistralModel.embed_tokens.
+    def config_sanity_check(self):  # No-op: this policy performs no config validation.
+        pass
+
+    def preprocess(self):  # Pads the token embedding under tensor parallelism, then returns the model.
+        if self.shard_config.enable_tensor_parallelism:
+            # Round the vocabulary size up to the next multiple of the tensor-parallel size.
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+
+            if vocab_size % world_size != 0:
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralModel
+
+        policy = {}  # maps a module class to its replacement description; empty without tensor parallelism
+
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False  # mutates the shared shard_config in place
+            warnings.warn(
+                "Mistral doesn't support sequence parallelism now, will ignore the sequence parallelism flag."
+            )
+
+        if self.shard_config.enable_tensor_parallelism:
+            decoder_attribute_replacement = {  # attention sizes, each integer-divided by tensor_parallel_size
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
+                // self.shard_config.tensor_parallel_size,
+            }
+
+            policy[MistralDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[  # q/k/v/gate/up -> Linear1D_Col, o/down -> Linear1D_Row
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.up_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.down_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                ],
+            )
+
+            self.append_or_create_submodule_replacement(  # registers embed_tokens -> VocabParallelEmbedding1D
+                description=SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=VocabParallelEmbedding1D,
+                ),
+                policy=policy,
+                target_key=MistralModel,
+            )
+
+        return policy
+
+    def postprocess(self):  # Nothing to undo after sharding; the model is returned as-is.
+        return self.model
+
+
+class LlavaMistralForCausalLMPolicy(LlavaMistralPolicy):  # Parent policy plus an lm_head replacement.
+    def module_policy(self):  # Returns the parent's policy dict, extended under tensor parallelism.
+        from transformers import MistralForCausalLM
+
+        policy = super().module_policy()
+
+        if self.shard_config.enable_tensor_parallelism:
+            # Add an entry for the causal LM: lm_head is replaced by Linear1D_Col with gather_output=True.
+            new_item = {
+                MistralForCausalLM: ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                        )
+                    ]
+                )
+            }
+            policy.update(new_item)
+        return policy
diff --git a/tools/caption/camera_motion/__init__.py b/tools/caption/camera_motion/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/caption/camera_motion/camera_motion.py b/tools/caption/camera_motion/camera_motion.py
new file mode 100644
index 0000000..9f283cb
--- /dev/null
+++ b/tools/caption/camera_motion/camera_motion.py
@@ -0,0 +1,146 @@
+import os
+
+import numpy as np
+import torch
+
+from .utils import load_video
+from .visualizer import Visualizer
+
+
+def transform(vector):  # Average a sequence of (x, y) pairs into a single [mean_x, mean_y].
+    x = np.mean([item[0] for item in vector])  # mean of the first components
+    y = np.mean([item[1] for item in vector])  # mean of the second components
+    return [x, y]
+
+
+class CameraPredict:
+    def __init__(self, device, submodules_list, factor=0.25):
+        self.device = device
+        self.grid_size = 10
+        self.factor = factor
+        try:
+            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
+        except:
+            # workaround for CERTIFICATE_VERIFY_FAILED (see: https://github.com/pytorch/pytorch/issues/33288#issuecomment-954160699)
+            import ssl
+
+            ssl._create_default_https_context = ssl._create_unverified_context
+            self.model = torch.hub.load(submodules_list["repo"], submodules_list["model"]).to(self.device)
+
+    def infer(self, video_path, save_video=False, save_dir="./saved_videos"):
+        # load video
+        video = load_video(video_path, return_tensor=False)
+        # set scale
+        height, width = video.shape[1], video.shape[2]
+        self.scale = min(height, width)
+        video = torch.from_numpy(video).permute(0, 3, 1, 2)[None].float().to(self.device)  # B T C H W
+        pred_tracks, pred_visibility = self.model(video, grid_size=self.grid_size)  # B T N 2,  B T N 1
+
+        if save_video:
+            video_name = os.path.basename(video_path)[:-4]
+            vis = Visualizer(save_dir=save_dir, pad_value=120, linewidth=3)
+            vis.visualize(video, pred_tracks, pred_visibility, filename=video_name)
+
+        return pred_tracks[0].long().detach().cpu().numpy()
+
+    def transform_class(self, vector, min_reso):  # 768*0.05
+        scale = min_reso * self.factor
+        x, y = vector
+        direction = []
+        if x > scale:
+            direction.append("right")
+        elif x < -scale:
+            direction.append("left")
+
+        if y > scale:
+            direction.append("down")
+        elif y < -scale:
+            direction.append("up")
+
+        return direction if direction else ["static"]
+
+    def get_edge_point(self, track):
+        middle = self.grid_size // 2
+        top = [list(track[0, i, :]) for i in range(middle - 2, middle + 2)]
+        down = [list(track[self.grid_size - 1, i, :]) for i in range(middle - 2, middle + 2)]
+        left = [list(track[i, 0, :]) for i in range(middle - 2, middle + 2)]
+        right = [list(track[i, self.grid_size - 1, :]) for i in range(middle - 2, middle + 2)]
+
+        return top, down, left, right
+
+    def get_edge_direction(self, track1, track2):
+        edge_points1 = self.get_edge_point(track1)
+        edge_points2 = self.get_edge_point(track2)
+
+        vector_results = []
+        for points1, points2 in zip(edge_points1, edge_points2):
+            vectors = [[end[0] - start[0], end[1] - start[1]] for start, end in zip(points1, points2)]
+            vector_results.append(vectors)
+        vector_results = list(map(transform, vector_results))
+        class_results = [self.transform_class(vector, min_reso=self.scale) for vector in vector_results]
+
+        return class_results
+
+    def classify_top_down(self, top, down):
+        results = []
+        classes = [f"{item_t}_{item_d}" for item_t in top for item_d in down]
+
+        results_mapping = {
+            "left_left": "pan_right",
+            "right_right": "pan_left",
+            "down_down": "tilt_up",
+            "up_up": "tilt_down",
+            "up_down": "zoom_in",
+            "down_up": "zoom_out",
+            "static_static": "static",
+        }
+        results = [results_mapping.get(cls) for cls in classes if cls in results_mapping]
+        return results if results else ["None"]
+
+    def classify_left_right(self, left, right):
+        results = []
+        classes = [f"{item_l}_{item_r}" for item_l in left for item_r in right]
+        results_mapping = {
+            "left_left": "pan_right",
+            "right_right": "pan_left",
+            "down_down": "tilt_up",
+            "up_up": "tilt_down",
+            "left_right": "zoom_in",
+            "right_left": "zoom_out",
+            "static_static": "static",
+        }
+        results = [results_mapping.get(cls) for cls in classes if cls in results_mapping]
+        return results if results else ["None"]
+
+    def camera_classify(self, track1, track2):
+        top, down, left, right = self.get_edge_direction(track1, track2)
+
+        top_results = self.classify_top_down(top, down)
+        left_results = self.classify_left_right(left, right)
+
+        results = list(set(top_results + left_results))
+        if "None" in results and len(results) > 1:
+            results.remove("None")
+        if "static" in results and len(results) > 1:
+            results.remove("static")
+        if len(results) == 1 and results[0] == "None":  # Tom added this to deal with edge cases
+            results = ["Undetermined"]
+        return results
+
+    def predict(self, video_path):
+        pred_track = self.infer(video_path)
+        track1 = pred_track[0].reshape((self.grid_size, self.grid_size, 2))
+        track2 = pred_track[-1].reshape((self.grid_size, self.grid_size, 2))
+        results = self.camera_classify(track1, track2)
+        return results
+
+
+def compute_camera_motion(device, submodules_dict, video_paths, factor):  # One motion string per video path.
+    camera = CameraPredict(device, submodules_dict, factor)
+    # A single CameraPredict instance is reused for every video; the labels
+    # predicted for each video are joined with "+" into one string.
+    all_predictions = []
+    for video_path in video_paths:
+        camera_motion_types = camera.predict(video_path)
+        all_predictions.append("+".join(camera_motion_types))
+    return all_predictions
diff --git a/tools/caption/camera_motion/detect.py b/tools/caption/camera_motion/detect.py
new file mode 100644
index 0000000..7e7eff8
--- /dev/null
+++ b/tools/caption/camera_motion/detect.py
@@ -0,0 +1,31 @@
+# Originally developed by https://github.com/Vchitect/VBench based on https://github.com/facebookresearch/co-tracker.
+
+import argparse
+from typing import List
+
+import pandas as pd
+
+from .camera_motion import compute_camera_motion
+
+
+def process(paths: List[str], threshold: float) -> List[str]:  # Classify camera motion for each path.
+    device = "cuda"  # hard-coded device string passed to compute_camera_motion
+    submodules = {"repo": "facebookresearch/co-tracker", "model": "cotracker2"}  # torch.hub repo / entry point
+    camera_motion_types = compute_camera_motion(device, submodules, paths, factor=threshold)
+    return camera_motion_types
+
+
+def main(args):
+    output_file = args.input.replace(".csv", "_cmotion.csv")
+    data = pd.read_csv(args.input)
+    data["cmotion"] = process(data["path"], args.threshold)
+    data.to_csv(output_file, index=False)
+    print(f"Output saved to {output_file}")
+
+
+if __name__ == "__main__":  # CLI entry point: detect.py INPUT [--threshold T]
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", type=str)  # path of the CSV read by main()
+    parser.add_argument("--threshold", type=float, default=0.25)  # forwarded to process() as threshold
+    args = parser.parse_args()
+    main(args)
diff --git a/tools/caption/camera_motion/requirements.txt b/tools/caption/camera_motion/requirements.txt
new file mode 100644
index 0000000..b699a95
--- /dev/null
+++ b/tools/caption/camera_motion/requirements.txt
@@ -0,0 +1,3 @@
+decord
+ptvsd
+imageio-ffmpeg
diff --git a/tools/caption/camera_motion/utils.py b/tools/caption/camera_motion/utils.py
new file mode 100644
index 0000000..15426ea
--- /dev/null
+++ b/tools/caption/camera_motion/utils.py
@@ -0,0 +1,112 @@
+import numpy as np
+import torch
+from decord import VideoReader
+from PIL import Image, ImageSequence
+
+
+def get_frame_indices(num_frames, vlen, sample="rand", fix_start=None, input_fps=1, max_num_frames=-1):
+    if sample in ["rand", "middle"]:  # uniform sampling
+        acc_samples = min(num_frames, vlen)
+        # split the video into `acc_samples` intervals, and sample from each interval.
+        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
+        ranges = []
+        for idx, interv in enumerate(intervals[:-1]):
+            ranges.append((interv, intervals[idx + 1] - 1))
+        if sample == "rand":
+            try:
+                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
+            except:
+                frame_indices = np.random.permutation(vlen)[:acc_samples]
+                frame_indices.sort()
+                frame_indices = list(frame_indices)
+        elif fix_start is not None:
+            frame_indices = [x[0] + fix_start for x in ranges]
+        elif sample == "middle":
+            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
+        else:
+            raise NotImplementedError
+
+        if len(frame_indices) < num_frames:  # padded with last frame
+            padded_frame_indices = [frame_indices[-1]] * num_frames
+            padded_frame_indices[: len(frame_indices)] = frame_indices
+            frame_indices = padded_frame_indices
+    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
+        output_fps = float(sample[3:])
+        duration = float(vlen) / input_fps
+        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
+        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
+        frame_indices = np.around(frame_seconds * input_fps).astype(int)
+        frame_indices = [e for e in frame_indices if e < vlen]
+        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
+            frame_indices = frame_indices[:max_num_frames]
+            # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
+    else:
+        raise ValueError
+    return frame_indices
+
+
+def load_video(video_path, data_transform=None, num_frames=None, return_tensor=True, width=None, height=None):
+    """
+    Load a video (or single image) from disk and return its frames.
+
+    The format is chosen from the file extension: ``.gif`` (all frames via PIL),
+    ``.png`` (a single frame via PIL) and ``.mp4`` (all frames via decord).
+
+    Parameters:
+    - video_path (str): Path of the file to load; must end in .gif, .png or .mp4.
+    - data_transform (callable, optional): Applied to the frame array; when given,
+      its result is returned as-is and ``return_tensor`` is ignored.
+    - num_frames (int, optional): If set, frames are subsampled with
+      ``get_frame_indices(..., sample="middle")``.
+    - return_tensor (bool): If True (and no ``data_transform``), return a torch
+      tensor permuted to (T, C, H, W); otherwise return the array unchanged.
+    - width, height (int, optional): Passed to decord's VideoReader for .mp4 when width is truthy.
+
+    Returns:
+    - frames: uint8 numpy array of shape (T, H, W, C) when ``return_tensor`` is
+      False; a tensor of shape (T, C, H, W) built with ``torch.Tensor`` when it
+      is True; or whatever ``data_transform`` returns.
+
+    Raises:
+    - NotImplementedError: If the file extension is not supported.
+    """
+    if video_path.endswith(".gif"):
+        frame_ls = []
+        img = Image.open(video_path)
+        for frame in ImageSequence.Iterator(img):
+            frame = frame.convert("RGB")
+            frame = np.array(frame).astype(np.uint8)
+            frame_ls.append(frame)
+        buffer = np.array(frame_ls).astype(np.uint8)
+    elif video_path.endswith(".png"):
+        frame = Image.open(video_path)
+        frame = frame.convert("RGB")
+        frame = np.array(frame).astype(np.uint8)
+        frame_ls = [frame]
+        buffer = np.array(frame_ls)
+    elif video_path.endswith(".mp4"):
+        import decord
+
+        decord.bridge.set_bridge("native")
+        if width:
+            video_reader = VideoReader(video_path, width=width, height=height, num_threads=1)
+        else:
+            video_reader = VideoReader(video_path, num_threads=1)
+        frames = video_reader.get_batch(range(len(video_reader)))  # every frame; converted to numpy below
+
+        buffer = frames.asnumpy().astype(np.uint8)
+    else:
+        raise NotImplementedError
+
+    frames = buffer
+    if num_frames:
+        frame_indices = get_frame_indices(num_frames, len(frames), sample="middle")
+        frames = frames[frame_indices]
+
+    if data_transform:
+        frames = data_transform(frames)
+    elif return_tensor:
+        frames = torch.Tensor(frames)  # NOTE: torch.Tensor(...) yields the default float dtype, not uint8
+        frames = frames.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
+
+    return frames
diff --git a/tools/caption/camera_motion/visualizer.py b/tools/caption/camera_motion/visualizer.py
new file mode 100644
index 0000000..f4f2dc6
--- /dev/null
+++ b/tools/caption/camera_motion/visualizer.py
@@ -0,0 +1,339 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the cotracker github repo. https://github.com/facebookresearch/co-tracker.
+import os
+
+import imageio
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+from matplotlib import cm
+from PIL import Image, ImageDraw
+
+
+def read_video_from_path(path):  # Read all frames with imageio into one stacked array; None if opening fails.
+    try:
+        reader = imageio.get_reader(path)
+    except Exception as e:
+        print("Error opening video file: ", e)
+        return None  # the failure is reported on stdout rather than raised
+    frames = []
+    for i, im in enumerate(reader):
+        frames.append(np.array(im))
+    return np.stack(frames)  # stacked along a new leading frame axis
+
+
+def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True):  # Draw a circle on `rgb` in place and return it.
+    # Create a draw object
+    draw = ImageDraw.Draw(rgb)
+    # Calculate the bounding box of the circle
+    left_up_point = (coord[0] - radius, coord[1] - radius)
+    right_down_point = (coord[0] + radius, coord[1] + radius)
+    # Draw the circle: filled when `visible` is truthy, outline only otherwise
+    draw.ellipse(
+        [left_up_point, right_down_point],
+        fill=tuple(color) if visible else None,
+        outline=tuple(color),
+    )
+    return rgb
+
+
+def draw_line(rgb, coord_y, coord_x, color, linewidth):  # Draw a segment from coord_y to coord_x on `rgb`; return it.
+    draw = ImageDraw.Draw(rgb)
+    draw.line(
+        (coord_y[0], coord_y[1], coord_x[0], coord_x[1]),  # despite the names, both arguments are 2-D points
+        fill=tuple(color),
+        width=linewidth,
+    )
+    return rgb
+
+
+def add_weighted(rgb, alpha, original, beta, gamma):  # Blend: rgb * alpha + original * beta + gamma, cast to uint8.
+    return (rgb * alpha + original * beta + gamma).astype("uint8")
+
+
+class Visualizer:
+    def __init__(
+        self,
+        save_dir: str = "./results",
+        grayscale: bool = False,
+        pad_value: int = 0,
+        fps: int = 10,
+        mode: str = "rainbow",  # 'cool', 'optical_flow'
+        linewidth: int = 2,
+        show_first_frame: int = 10,
+        tracks_leave_trace: int = 0,  # -1 for infinite
+    ):
+        self.mode = mode
+        self.save_dir = save_dir
+        if mode == "rainbow":
+            self.color_map = cm.get_cmap("gist_rainbow")
+        elif mode == "cool":
+            self.color_map = cm.get_cmap(mode)
+        self.show_first_frame = show_first_frame
+        self.grayscale = grayscale
+        self.tracks_leave_trace = tracks_leave_trace
+        self.pad_value = pad_value
+        self.linewidth = linewidth
+        self.fps = fps
+
+    def visualize(
+        self,
+        video: torch.Tensor,  # (B,T,C,H,W)
+        tracks: torch.Tensor,  # (B,T,N,2)
+        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
+        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
+        segm_mask: torch.Tensor = None,  # (B,1,H,W)
+        filename: str = "video",
+        writer=None,  # tensorboard Summary Writer, used for visualization during training
+        step: int = 0,
+        query_frame: int = 0,
+        save_video: bool = True,
+        compensate_for_camera_motion: bool = False,
+    ):
+        if compensate_for_camera_motion:
+            assert segm_mask is not None
+        if segm_mask is not None:
+            coords = tracks[0, query_frame].round().long()
+            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()
+
+        video = F.pad(
+            video,
+            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
+            "constant",
+            255,
+        )
+        print("video shape after pad is: ", video.shape)
+        tracks = tracks + self.pad_value
+
+        print(tracks)
+        print("tracks shape after pad is: ", tracks.shape)
+
+        if self.grayscale:
+            transform = transforms.Grayscale()
+            video = transform(video)
+            video = video.repeat(1, 1, 3, 1, 1)
+
+        res_video = self.draw_tracks_on_video(
+            video=video,
+            tracks=tracks,
+            visibility=visibility,
+            segm_mask=segm_mask,
+            gt_tracks=gt_tracks,
+            query_frame=query_frame,
+            compensate_for_camera_motion=compensate_for_camera_motion,
+        )
+        if save_video:
+            self.save_video(res_video, filename=filename, writer=writer, step=step)
+        return res_video
+
+    def save_video(self, video, filename, writer=None, step=0):
+        if writer is not None:
+            writer.add_video(
+                filename,
+                video.to(torch.uint8),
+                global_step=step,
+                fps=self.fps,
+            )
+        else:
+            os.makedirs(self.save_dir, exist_ok=True)
+            wide_list = list(video.unbind(1))
+            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]
+
+            # Prepare the video file path
+            save_path = os.path.join(self.save_dir, f"{filename}.mp4")
+
+            # Create a writer object
+            video_writer = imageio.get_writer(save_path, fps=self.fps)
+
+            # Write frames to the video file
+            for frame in wide_list[2:-1]:
+                video_writer.append_data(frame)
+
+            video_writer.close()
+
+            print(f"Video saved to {save_path}")
+
+    def draw_tracks_on_video(
+        self,
+        video: torch.Tensor,
+        tracks: torch.Tensor,
+        visibility: torch.Tensor = None,
+        segm_mask: torch.Tensor = None,
+        gt_tracks=None,
+        query_frame: int = 0,
+        compensate_for_camera_motion=False,
+    ):
+        B, T, C, H, W = video.shape
+        _, _, N, D = tracks.shape
+
+        assert D == 2
+        assert C == 3
+        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
+        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
+        if gt_tracks is not None:
+            gt_tracks = gt_tracks[0].detach().cpu().numpy()
+
+        res_video = []
+
+        # process input video
+        for rgb in video:
+            res_video.append(rgb.copy())
+        vector_colors = np.zeros((T, N, 3))
+
+        if self.mode == "optical_flow":
+            import flow_vis
+
+            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
+        elif segm_mask is None:
+            if self.mode == "rainbow":
+                y_min, y_max = (
+                    tracks[query_frame, :, 1].min(),
+                    tracks[query_frame, :, 1].max(),
+                )
+                norm = plt.Normalize(y_min, y_max)
+                for n in range(N):
+                    color = self.color_map(norm(tracks[query_frame, n, 1]))
+                    color = np.array(color[:3])[None] * 255
+                    vector_colors[:, n] = np.repeat(color, T, axis=0)
+            else:
+                # color changes with time
+                for t in range(T):
+                    color = np.array(self.color_map(t / T)[:3])[None] * 255
+                    vector_colors[t] = np.repeat(color, N, axis=0)
+        else:
+            if self.mode == "rainbow":
+                vector_colors[:, segm_mask <= 0, :] = 255
+
+                y_min, y_max = (
+                    tracks[0, segm_mask > 0, 1].min(),
+                    tracks[0, segm_mask > 0, 1].max(),
+                )
+                norm = plt.Normalize(y_min, y_max)
+                for n in range(N):
+                    if segm_mask[n] > 0:
+                        color = self.color_map(norm(tracks[0, n, 1]))
+                        color = np.array(color[:3])[None] * 255
+                        vector_colors[:, n] = np.repeat(color, T, axis=0)
+
+            else:
+                # color changes with segm class
+                segm_mask = segm_mask.cpu()
+                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
+                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
+                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
+                vector_colors = np.repeat(color[None], T, axis=0)
+
+        #  draw tracks
+        if self.tracks_leave_trace != 0:
+            for t in range(query_frame + 1, T):
+                first_ind = max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0
+                curr_tracks = tracks[first_ind : t + 1]
+                curr_colors = vector_colors[first_ind : t + 1]
+                if compensate_for_camera_motion:
+                    diff = (tracks[first_ind : t + 1, segm_mask <= 0] - tracks[t : t + 1, segm_mask <= 0]).mean(1)[
+                        :, None
+                    ]
+
+                    curr_tracks = curr_tracks - diff
+                    curr_tracks = curr_tracks[:, segm_mask > 0]
+                    curr_colors = curr_colors[:, segm_mask > 0]
+
+                res_video[t] = self._draw_pred_tracks(
+                    res_video[t],
+                    curr_tracks,
+                    curr_colors,
+                )
+                if gt_tracks is not None:
+                    res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1])
+
+        #  draw points
+        for t in range(query_frame, T):
+            img = Image.fromarray(np.uint8(res_video[t]))
+            for i in range(N):
+                coord = (tracks[t, i, 0], tracks[t, i, 1])
+                visibile = True
+                if visibility is not None:
+                    visibile = visibility[0, t, i]
+                if coord[0] != 0 and coord[1] != 0:
+                    if not compensate_for_camera_motion or (compensate_for_camera_motion and segm_mask[i] > 0):
+                        img = draw_circle(
+                            img,
+                            coord=coord,
+                            radius=int(self.linewidth * 2),
+                            color=vector_colors[t, i].astype(int),
+                            visible=visibile,
+                        )
+            res_video[t] = np.array(img)
+
+        #  construct the final rgb sequence
+        if self.show_first_frame > 0:
+            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
+        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()
+
+    def _draw_pred_tracks(
+        self,
+        rgb: np.ndarray,  # H x W x 3
+        tracks: np.ndarray,  # T x 2
+        vector_colors: np.ndarray,
+        alpha: float = 0.5,
+    ):
+        T, N, _ = tracks.shape
+        rgb = Image.fromarray(np.uint8(rgb))
+        for s in range(T - 1):
+            vector_color = vector_colors[s]
+            original = rgb.copy()
+            alpha = (s / T) ** 2
+            for i in range(N):
+                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
+                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
+                if coord_y[0] != 0 and coord_y[1] != 0:
+                    rgb = draw_line(
+                        rgb,
+                        coord_y,
+                        coord_x,
+                        vector_color[i].astype(int),
+                        self.linewidth,
+                    )
+            if self.tracks_leave_trace > 0:
+                rgb = Image.fromarray(np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0)))
+        rgb = np.array(rgb)
+        return rgb
+
+    def _draw_gt_tracks(
+        self,
+        rgb: np.ndarray,  # H x W x 3,
+        gt_tracks: np.ndarray,  # T x 2
+    ):
+        T, N, _ = gt_tracks.shape
+        color = np.array((211, 0, 0))
+        rgb = Image.fromarray(np.uint8(rgb))
+        for t in range(T):
+            for i in range(N):
+                gt_tracks = gt_tracks[t][i]
+                #  draw a red cross
+                if gt_tracks[0] > 0 and gt_tracks[1] > 0:
+                    length = self.linewidth * 3
+                    coord_y = (int(gt_tracks[0]) + length, int(gt_tracks[1]) + length)
+                    coord_x = (int(gt_tracks[0]) - length, int(gt_tracks[1]) - length)
+                    rgb = draw_line(
+                        rgb,
+                        coord_y,
+                        coord_x,
+                        color,
+                        self.linewidth,
+                    )
+                    coord_y = (int(gt_tracks[0]) - length, int(gt_tracks[1]) + length)
+                    coord_x = (int(gt_tracks[0]) + length, int(gt_tracks[1]) - length)
+                    rgb = draw_line(
+                        rgb,
+                        coord_y,
+                        coord_x,
+                        color,
+                        self.linewidth,
+                    )
+        rgb = np.array(rgb)
+        return rgb
diff --git a/tools/caption/camera_motion_detect.py b/tools/caption/camera_motion_detect.py
new file mode 100644
index 0000000..cc0077c
--- /dev/null
+++ b/tools/caption/camera_motion_detect.py
@@ -0,0 +1,132 @@
+# ref: https://github.com/antiboredom/camera-motion-detector
+
+import argparse
+
+import cv2
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+def apply(df, func, **kwargs):
+    """Apply func over df via parallel_apply when pandas_has_parallel is set, else via progress_apply."""
+    runner = df.parallel_apply if pandas_has_parallel else df.progress_apply
+    return runner(func, **kwargs)
+
+
+try:
+    from pandarallel import pandarallel
+    # pandarallel is optional: when it imports, apply() routes through parallel_apply
+    pandarallel.initialize(progress_bar=True)
+    pandas_has_parallel = True
+except ImportError:
+    pandas_has_parallel = False  # apply() then uses tqdm's progress_apply
+
+
+def make_empty(new_w, new_h):
+    """Build a grid of pixel coordinates whose [y][x] entry is the pair [x, y].
+
+    Args:
+        new_w: number of columns (x runs over range(new_w)).
+        new_h: number of rows (y runs over range(new_h)).
+    """
+    grid_rows = [[[col, row] for col in range(new_w)] for row in range(new_h)]
+    pixel_grid = np.array(grid_rows)
+    return pixel_grid
+
+
+def get_type(mag, ang, zoom_in, tau_static=1.0, tau_zoom=(0.4, 0.6)):
+    """Label one frame pair from its flow magnitude, flow angle and zoom-in ratio.
+
+    Checks run in priority order: static, zoom out/in, then the angle sector.
+    """
+    if mag < tau_static:
+        return "static"
+    if zoom_in < tau_zoom[0]:
+        return "zoom out"
+    if zoom_in > tau_zoom[1]:
+        return "zoom in"
+    if ang < 45 or ang >= 315:  # this sector wraps around the 0 boundary
+        return "pan left"
+    sectors = ((45, 135, "tilt up"), (135, 225, "pan right"), (225, 315, "tilt down"))
+    hits = [label for low, high, label in sectors if low <= ang < high]
+    return hits[0] if hits else "unknown"
+
+
+def get_video_type(frame_types):
+    """Aggregate per-frame motion labels into a single label for the video.
+
+    Returns the label held by a strict majority of frames when there is one.
+    Otherwise returns "unknown" if any frame was "static", "pan/tilt" if no
+    frame was a zoom, and "dynamic" for the remaining mixes. An empty input
+    (no frame pair could be analysed) yields "unknown".
+    """
+    from collections import Counter  # local import: keeps the block self-contained
+
+    if not frame_types:
+        return "unknown"  # nothing was measured, so do not claim "pan/tilt"
+    counts = Counter(frame_types)
+    max_type, max_count = counts.most_common(1)[0]
+    if max_count > len(frame_types) / 2:
+        return max_type
+    if "static" in counts:
+        return "unknown"
+    return "dynamic" if "zoom in" in counts or "zoom out" in counts else "pan/tilt"
+
+
+def process(path: str, frame_interval=15) -> str:
+    """Classify the camera motion of the video at `path`.
+
+    Every `frame_interval`-th frame is compared with the first frame using
+    Farneback optical flow; get_video_type aggregates the per-frame labels.
+    """
+    cap = cv2.VideoCapture(path)
+    count, prvs, frame_types = 0, None, []
+    try:  # release the capture even if decoding or the flow computation raises
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            if count == 0:
+                prvs = frame  # NOTE(review): never updated afterwards - confirm this is intended
+                h, w = frame.shape
+                empty = make_empty(w, h)
+                empty_dists = np.sqrt(
+                    np.square(empty.ravel()[::2] - (w / 2)) + np.square(empty.ravel()[1::2] - (h / 2))
+                )
+            else:
+                flow = cv2.calcOpticalFlowFarneback(prvs, frame, None, 0.5, 3, 15, 3, 5, 1.2, 0)
+                mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1], angleInDegrees=True)
+                flow_coords = flow + empty
+                xvals = flow_coords.ravel()[::2] - (w / 2)
+                yvals = flow_coords.ravel()[1::2] - (h / 2)
+                dists = np.sqrt(np.square(xvals) + np.square(yvals))
+                dist_diff = dists >= empty_dists  # True where the displaced pixel is no closer to the centre
+                zoom_in_factor = np.count_nonzero(dist_diff) / len(dist_diff)
+                frame_types.append(get_type(np.median(mag), np.median(ang), zoom_in_factor))
+            count += frame_interval
+            cap.set(cv2.CAP_PROP_POS_FRAMES, count)
+    finally:
+        cap.release()
+    return get_video_type(frame_types)
+
+
+def main(args):
+    """Save args.input with an added "cmotion" column as <stem>_cmotion.csv (only a trailing ".csv" is swapped)."""
+    output_file = (args.input[:-4] if args.input.endswith(".csv") else args.input) + "_cmotion.csv"
+    data = pd.read_csv(args.input)
+    data["cmotion"] = apply(data["path"], process)
+    data.to_csv(output_file, index=False)
+    print(f"Output saved to {output_file}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", type=str)
+    parser.add_argument("--disable-parallel", action="store_true")
+    args = parser.parse_args()
+    pandas_has_parallel = pandas_has_parallel and not args.disable_parallel  # --disable-parallel forces progress_apply
+    main(args)
diff --git a/tools/caption/caption_gpt4.py b/tools/caption/caption_gpt4.py
new file mode 100644
index 0000000..b896480
--- /dev/null
+++ b/tools/caption/caption_gpt4.py
@@ -0,0 +1,89 @@
+import argparse
+import base64
+import csv
+import os
+from io import BytesIO
+
+import tqdm
+from openai import OpenAI
+
+from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, VideoTextDataset
+
+client = OpenAI()
+
+
+def to_base64(image):
+    jpeg_buffer = BytesIO()  # in-memory target that image.save writes the JPEG into
+    image.save(jpeg_buffer, format="JPEG")
+    return str(base64.b64encode(jpeg_buffer.getvalue()), "utf-8")
+
+
+def get_caption(frame, prompt):
+    """Ask the chat model to caption three base64 JPEG frames under `prompt`.
+
+    `frame` is indexed at 0, 1 and 2; newlines in the reply become spaces.
+    """
+    image_parts = [
+        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[idx]}", "detail": "low"}}
+        for idx in range(3)
+    ]
+    conversation = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": image_parts},
+    ]
+    response = client.chat.completions.create(
+        model="gpt-4o-2024-08-06",
+        messages=conversation,
+        max_tokens=300,
+        top_p=0.1,
+    )
+    reply = response.choices[0].message.content
+    return reply.replace("\n", " ")
+
+
+def main(args):
+    """Caption every sample listed in args.input and write <stem>_caption.csv.
+
+    The output has the columns "video" and "text". The prompt named by
+    args.prompt must match the kind of data (image or video), which is judged
+    from the extension of the first path in the dataset.
+    """
+    # ======================================================
+    # 1. read video list
+    # ======================================================
+    dataset = VideoTextDataset(args.input, resize=360)
+    output_file = os.path.splitext(args.input)[0] + "_caption.csv"
+
+    # make sure that the prompt type matches the data type
+    ext = ("." + dataset.data["path"].iloc[0].split(".")[-1]).lower()
+    prompt_type = PROMPTS[args.prompt]["type"]
+    if prompt_type == "image":
+        assert ext in IMG_EXTENSIONS, "The prompt is suitable for an image dataset but the data is not image."
+    elif prompt_type == "video":
+        assert ext in VID_EXTENSIONS, "The prompt is suitable for a video dataset but the data is not video."
+    else:
+        raise ValueError(f"Found invalid prompt type {prompt_type}")
+
+    # ======================================================
+    # 2. generate captions
+    # ======================================================
+    # Opened only after validation; the with-block closes it even if a request raises.
+    with open(output_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["video", "text"])
+        for sample in tqdm.tqdm(dataset):
+            prompt = PROMPTS[args.prompt]["text"]
+            if "text" in args.prompt:
+                prompt = prompt.format(sample["text"])
+            frames = [to_base64(frame) for frame in sample["image"]]
+            caption = get_caption(frames, prompt)
+            writer.writerow((sample["path"], caption))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", type=str, help="Path to the input CSV file")
+    parser.add_argument("--prompt", type=str, default="video-template")  # 1k/1h
+    args = parser.parse_args()
+    # args.prompt is used in main() as a key into PROMPTS
+    main(args)
diff --git a/tools/caption/caption_llama3.py b/tools/caption/caption_llama3.py
new file mode 100644
index 0000000..5337c0e
--- /dev/null
+++ b/tools/caption/caption_llama3.py
@@ -0,0 +1,292 @@
+import argparse
+import csv
+import os
+import warnings
+from datetime import timedelta
+
+import pandas as pd
+import torch
+import torch.distributed as dist
+from torch.utils.data import Dataset
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from utils import read_file
+
+os.system(f"cp {__file__} ~/backup/")  # optionally backup the script
+warnings.filterwarnings("ignore")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from torch.distributed.elastic.multiprocessing.errors import record
+
+
+class CSVTextDataset(Dataset):
+    """Rows of a CSV file that must contain a "text" column, shardable per rank."""
+
+    def __init__(self, csv_path):
+        self.df = pd.read_csv(csv_path)
+        # assert text is in the columns
+        assert "text" in self.df.columns, "text column not found in the csv file"
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        if idx < 0 or idx >= len(self.df):
+            raise IndexError
+        return self.df.iloc[idx]
+
+    def set_rank_and_world_size(self, rank, world_size):
+        """Keep only this rank's contiguous slice; the last rank also takes the remainder."""
+        self.rank = rank
+        self.world_size = world_size
+        self.data_per_gpu = len(self) // world_size
+        self.start_index = rank * self.data_per_gpu
+        self.end_index = (rank + 1) * self.data_per_gpu if rank != world_size - 1 else len(self)
+        self.df = self.df.iloc[self.start_index : self.end_index]
+
+    def write_to_csv(self, output_file, data, new_key):
+        """Write the held rows to output_file with data[i] appended to row i under new_key."""
+        with open(output_file, "w", newline="") as f:  # csv.writer has no close(); close the file instead
+            writer = csv.writer(f)
+            writer.writerow(list(self.df.columns) + [new_key])  # Index + list would not append
+            for position, (_, row) in enumerate(self.df.iterrows()):
+                writer.writerow([*row, data[position]])
+
+
+def pad_left(sequences, padding_value=0):
+    """Left-pad 1-D tensors with padding_value to a common length and stack them.
+
+    Args:
+        sequences: non-empty list of tensors; length is taken from size(0).
+        padding_value: fill value placed in front of shorter sequences.
+
+    Returns:
+        A tensor with one row per input sequence.
+    """
+    target_len = max(seq.size(0) for seq in sequences)
+    rows = []
+    for seq in sequences:
+        filler = torch.full((target_len - seq.size(0),), padding_value, dtype=seq.dtype).to(seq.device)
+        rows.append(torch.cat([filler, seq], dim=0))
+    return torch.stack(rows)
+
+
+def _parse_bracket_list(response):
+    """Parse a model reply of the form "[a, b, c]" into a list of items.
+
+    Args:
+        response: decoded model output; one pair of surrounding single
+            quotes is stripped before parsing.
+
+    Returns:
+        The non-empty, whitespace-stripped items between the first '[' and
+        the first ']' after it, or the string "NONE_FOUND" when no such
+        bracket pair exists.
+    """
+    if response.startswith("'") and response.endswith("'"):
+        response = response[1:-1]
+    start = response.find("[")
+    end = response.find("]", start + 1)
+    if start == -1 or end == -1:
+        return "NONE_FOUND"
+    return [item.strip() for item in response[start + 1 : end].split(",") if item.strip()]
+
+
+@record
+def main(args):
+    """Extract object and action keywords from every caption with Llama 3.
+
+    Each rank handles one contiguous shard of the input CSV and writes it to
+    <dir>/parts/<stem>_rank<r>_objects_actions.csv; after a barrier, rank 0
+    concatenates all shards into <dir>/<stem>_objects_actions.csv.
+
+    Args:
+        args: parsed command line with the attributes input (CSV path),
+            model_id and batch_size.
+    """
+    # ======================================================
+    # 1. init environment
+    # ======================================================
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+
+    # ======================================================
+    # 2. Prep rank-wise dataloader
+    # ======================================================
+    # read_file supplies the column names; CSVTextDataset holds this rank's rows
+    dataframe = read_file(args.input)
+    print("read data from {}".format(args.input))
+    dataset = CSVTextDataset(args.input)
+    dataset.set_rank_and_world_size(dist.get_rank(), dist.get_world_size())
+
+    # Optional remote debugging: rank 2 waits for a ptvsd client when DEBUG_ADDRESS is set.
+    if os.getenv("DEBUG_ADDRESS") is not None and dist.get_rank() == 2:
+        import ptvsd
+
+        print("waiting for debugger attachment")
+        ptvsd.enable_attach(address=("localhost", int(os.getenv("DEBUG_ADDRESS"))), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Per-rank shards live in a "parts" directory next to the input file; the
+    # merged result is written next to the input itself. os.path also handles
+    # inputs without a directory component, which the manual "/" search did not.
+    input_dir, input_stem = os.path.split(os.path.splitext(args.input)[0])
+    output_prefix_total = os.path.join(input_dir, input_stem)
+    output_prefix = os.path.join(input_dir, "parts", input_stem)
+
+    output_file = output_prefix + f"_rank{dist.get_rank()}" + "_objects_actions.csv"
+    # make sure the directory exists
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+    # output columns: the input columns plus the two extracted keyword lists
+    columns = list(dataframe.columns) + ["objects"] + ["actions"]
+    print("the processed data saved on this rank will be saved to {}".format(output_file))
+
+    def collate_fn(batch):
+        # keep each batch as a plain list of rows
+        return batch
+
+    # shuffle stays off so rows are written in input order
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        collate_fn=collate_fn,
+        shuffle=False,
+    )
+
+    # ======================================================
+    # 3. process using llama3 and prompt
+    # ======================================================
+
+    print("Using model with the id {}".format(args.model_id))
+    model_id = args.model_id
+    tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.bfloat16,
+        device_map=dist.get_rank() % torch.cuda.device_count(),
+    )
+    # wait until every rank has loaded the model
+    dist.barrier()
+    print("======== Process data using LLAMA3 ========")
+
+    def extract_batch(texts, prompt):
+        """Generate one reply per text, with prompt as the system message.
+
+        Args:
+            texts: list of user messages, one per sample.
+            prompt: system message shared by the whole batch.
+
+        Returns:
+            A list with the decoded newly generated text for each sample.
+        """
+        input_ids_list = [
+            tokenizer.apply_chat_template(
+                [{"role": "system", "content": prompt}, {"role": "user", "content": text}],
+                add_generation_prompt=True,
+                return_tensors="pt",
+            ).to(model.device)[0]
+            for text in texts
+        ]
+
+        attention_mask_list = [
+            torch.ones(input_ids.shape, dtype=torch.long, device=model.device) for input_ids in input_ids_list
+        ]
+
+        # Generation continues from the last position of every row, so the
+        # padding must sit on the left; pad_sequence would pad on the right.
+        input_ids_batch = pad_left(input_ids_list, padding_value=tokenizer.eos_token_id)
+        attention_mask_batch = pad_left(attention_mask_list, padding_value=0)
+
+        # stop on either the tokenizer's EOS token or the "<|eot_id|>" token
+        terminators = [
+            tokenizer.eos_token_id,
+            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+        ]
+        # no sampling parameters are passed to generate
+        outputs = model.generate(
+            input_ids_batch,
+            max_new_tokens=512,
+            attention_mask=attention_mask_batch,
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=terminators,
+        )
+
+        # With left padding all prompts end at the same column, so the new
+        # tokens of every row start right after the padded prompt length.
+        prompt_len = input_ids_batch.shape[1]
+        return [
+            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
+            for output in outputs
+        ]
+
+    print("Processing starting...")
+    # System prompts: each asks for a bracketed, comma-separated list.
+    prompt_objects = (
+        "You are a AI assistant to extract objects from user's text. "
+        "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of objects separated by ',' and wrapped by '[' and ']': '[dog, person]' "
+    )
+    prompt_actions = (
+        "You are a AI assistant to extract actions from user's text. "
+        "For example: user: 'In this video a dog is running around. In addition, a person is laughing at the dog.', you produce a list of actions separated by ',' and wrapped by '[' and ']': '[run, laugh]' "
+    )
+
+    print("prompt_objects: {}".format(prompt_objects))
+    print("prompt_actions: {}".format(prompt_actions))
+
+    # The context manager closes the shard file even if generation raises.
+    with open(output_file, "w", newline="") as output_file_handle:
+        writer = csv.writer(output_file_handle)
+        writer.writerow(columns)
+        for batch in tqdm(dataloader):
+            # get the text column from the batch
+            texts = [row["text"] for row in batch]
+            # Extract keywords using both prompts
+            list_actions = extract_batch(texts, prompt_actions)
+            list_objects = extract_batch(texts, prompt_objects)
+            # Write one row per sample; writing after the loop would keep
+            # only the last sample of each batch.
+            for row, actions, objects in zip(batch, list_actions, list_objects):
+                parsed_objects = _parse_bracket_list(objects)
+                parsed_actions = _parse_bracket_list(actions)
+                writer.writerow([*row, parsed_objects, parsed_actions])
+
+    # every rank must finish writing before rank 0 reads the shards
+    dist.barrier()
+
+    # ======================================================
+    # 4. merge the per-rank files on rank 0
+    # ======================================================
+    if dist.get_rank() == 0:
+        collated_file = output_prefix_total + "_objects_actions.csv"
+        print("All ranks are finished. Collating the processed data to {}".format(collated_file))
+        csv_files = [
+            output_prefix + f"_rank{i}" + "_objects_actions.csv" for i in range(dist.get_world_size())
+        ]
+        # List to hold DataFrames
+        dataframes = []
+        # Read each CSV into a DataFrame and append to list
+        for file in csv_files:
+            df = pd.read_csv(file)
+            # empty cells are read back as NaN; replace them with "NONE_FOUND"
+            df["objects"] = df["objects"].fillna("NONE_FOUND")
+            df["actions"] = df["actions"].fillna("NONE_FOUND")
+            dataframes.append(df)
+        # Concatenate all DataFrames
+        combined_df = pd.concat(dataframes, ignore_index=True)
+        # Save the combined DataFrame to a new CSV file
+        combined_df.to_csv(collated_file, index=False)
+        print("Collated data saved to {}".format(collated_file))
+    # terminate distributed env
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-id", default="meta-llama/Meta-Llama-3-8B-Instruct")
+    parser.add_argument("input", type=str, help="Path to the input CSV file")
+    # The output location and both prompts are fixed inside main(); the former
+    # options --output_prefix, --prompt_objects, --prompt_actions and --key are
+    # therefore not registered here.
+    parser.add_argument("--batch_size", type=int, default=32)
+    # args.batch_size is passed to the DataLoader in main().
+    args = parser.parse_args()
+
+    main(args)
diff --git a/tools/caption/caption_llava.py b/tools/caption/caption_llava.py
new file mode 100644
index 0000000..1674a1b
--- /dev/null
+++ b/tools/caption/caption_llava.py
@@ -0,0 +1,348 @@
+import argparse
+import csv
+import time
+import warnings
+from datetime import timedelta
+
+import torch
+import torch.distributed as dist
+from colossalai.cluster import DistCoordinator, ProcessGroupMesh
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.utils import get_current_device, set_seed
+from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
+from llava.conversation import conv_templates
+from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
+from llava.model.builder import load_pretrained_model
+from llava.utils import disable_torch_init
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+
+from ..datasets.utils import IMG_EXTENSIONS, VID_EXTENSIONS
+from .acceleration.llava.policies import LlavaLlamaForCausalLMPolicy, LlavaMistralForCausalLMPolicy
+from .utils import PROMPTS, Timer, VideoTextDataset, collate_fn
+
+disable_torch_init()
+import transformers
+
+transformers.logging.set_verbosity_error()
+
+
+class NoPaddingDistributedSampler(DistributedSampler):
+    """DistributedSampler variant that never pads or drops samples.
+
+    Indices are dealt round-robin, so when the dataset size is not divisible by
+    num_replicas the first (size % num_replicas) ranks receive one extra item.
+    The shuffle and drop_last arguments are accepted but the parent is always
+    initialised with shuffle=False and drop_last=False.
+    """
+
+    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0, drop_last=False):
+        super().__init__(
+            dataset=dataset, num_replicas=num_replicas, rank=rank, seed=seed, shuffle=False, drop_last=False
+        )
+        base, extra = divmod(len(self.dataset), self.num_replicas)
+        # ranks below `extra` take one of the leftover items
+        self.num_samples = base + 1 if self.rank < extra else base
+        self.total_size = len(dataset)
+
+    def __iter__(self):
+        dataset_len = len(self.dataset)
+        if self.shuffle:
+            # deterministically shuffle based on epoch and seed
+            rng = torch.Generator()
+            rng.manual_seed(self.seed + self.epoch)
+            order = torch.randperm(dataset_len, generator=rng).tolist()
+        else:
+            order = list(range(dataset_len))
+        own = order[: self.total_size][self.rank : self.total_size : self.num_replicas]
+        assert len(own) == self.num_samples
+        return iter(own)
+
+
+@torch.inference_mode()
+def main(args):
+    """Caption the samples listed in args.input with a LLaVA model.
+
+    Data-parallel ranks share the samples; rank 0 gathers the rows and writes <stem>_caption.csv.
+    """
+    # ======================================================
+    # 1. init environment
+    # ======================================================
+    # we set a very large timeout to avoid some processes exit early
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+    set_seed(1024)
+    coordinator = DistCoordinator()
+
+    assert (
+        args.dp_size * args.tp_size == coordinator.world_size
+    ), f"DP size {args.dp_size} * TP size {args.tp_size} must equal to world size {coordinator.world_size}"
+
+    mesh = ProcessGroupMesh(args.dp_size, args.tp_size)
+    dp_group = mesh.get_group_along_axis(0)
+    tp_group = mesh.get_group_along_axis(1)
+
+    # ======================================================
+    # 2. load model
+    # ======================================================
+    model_path = args.model_path
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        tokenizer, model, image_processor, context_len = load_pretrained_model(
+            model_path=model_path,
+            model_base=None,
+            model_name=get_model_name_from_path(model_path),
+            device=get_current_device(),
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2" if args.flash_attention else "eager",
+        )
+        dist.barrier()
+
+    # ======================================================
+    # 3. Apply system optimization
+    # ======================================================
+    tp_size = dist.get_world_size(tp_group)
+    shard_config = ShardConfig(
+        tensor_parallel_process_group=tp_group if tp_size > 1 else None,
+        enable_tensor_parallelism=True if tp_size > 1 else False,
+    )
+    shard_former = ShardFormer(shard_config=shard_config)
+
+    # check the model type
+    model_name = model.__class__.__name__
+    print(model_name)
+    if model_name == "LlavaLlamaForCausalLM":
+        model = shard_former.optimize(model, policy=LlavaLlamaForCausalLMPolicy())[0].cuda()
+    elif model_name == "LlavaMistralForCausalLM":
+        model = shard_former.optimize(model, policy=LlavaMistralForCausalLMPolicy())[0].cuda()
+    else:
+        print(f"The shardformer policy for {model_name} is not implemented, skip")
+    torch.cuda.empty_cache()
+
+    # ======================================================
+    # 4. Prepare dataloader
+    # ======================================================
+    # prepare prompt
+    query = PROMPTS[args.prompt]["text"]
+    if dist.get_rank() == 0:
+        print(f"Prompt: {query}")
+
+    if "text" in args.prompt:
+
+        def get_text_input_ids(text):
+            conv = conv_templates["chatml_direct"].copy()
+            query_text = query.format(text)
+            conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query_text)
+            prompt = conv.get_prompt()
+            t = prompt.split("<image>")
+            prompt = t[0] + "<image>" * args.num_frames + t[1]
+            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+            input_ids = input_ids.unsqueeze(0)
+            return input_ids
+
+    else:
+        conv = conv_templates["chatml_direct"].copy()
+        conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
+        prompt = conv.get_prompt()
+        t = prompt.split("<image>")
+        prompt = t[0] + "<image>" * args.num_frames + t[1]
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+        input_ids = input_ids.unsqueeze(0)
+
+        def get_text_input_ids(*args):
+            return input_ids
+
+    # build dataset
+    def transform(imgs):
+        imgs = process_images(imgs, image_processor, model.config)
+        imgs = imgs.to(dtype=torch.float16)
+        return imgs
+
+    dataset = VideoTextDataset(
+        args.input,
+        transform=transform,
+        num_frames=args.num_frames,
+        get_text_input_ids=get_text_input_ids,
+        resize=args.resize,
+    )
+
+    # make sure that the prompt type matches the data type
+    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
+    prompt_type = PROMPTS[args.prompt]["type"]
+    if prompt_type == "image":
+        assert (
+            data_extension.lower() in IMG_EXTENSIONS
+        ), f"The prompt is suitable for an image dataset but the data is not image."
+    elif prompt_type == "video":
+        assert (
+            data_extension.lower() in VID_EXTENSIONS
+        ), f"The prompt is suitable for a video dataset but the data is not video."
+    else:
+        raise ValueError(f"Found invalid prompt type {prompt_type}")
+
+    total_num_videos = len(dataset)
+
+    # build sampler
+    dp_rank = dist.get_rank(dp_group)
+    dp_size = dist.get_world_size(dp_group)
+    sampler = NoPaddingDistributedSampler(dataset, rank=dp_rank, num_replicas=dp_size)
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=args.bs,
+        shuffle=False,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        prefetch_factor=args.prefetch_factor,
+        sampler=sampler,
+        collate_fn=collate_fn,
+    )
+
+    with open(args.input, "r") as f:
+        reader = csv.DictReader(f)
+        original_data = [row for row in reader]
+    headers = reader.fieldnames
+    # path -> row lookup; iterating in reverse keeps the first row of a duplicated path
+    rows_by_path = {row["path"]: row for row in reversed(original_data)}
+
+    # output path: swap only a trailing ".csv", so the input file is never overwritten
+    output_file = (args.input[:-4] if args.input.endswith(".csv") else args.input) + "_caption.csv"
+
+    # ======================================================
+    # 5. generate captions
+    # ======================================================
+    if dist.get_rank() == 0:
+        pbar = tqdm(dataloader, position=dp_rank, desc=f"Data Parallel Rank {dist.get_rank(dp_group)}")
+    else:
+        pbar = dataloader
+
+    if args.profile:
+        encode_time, generate_time, output_length, total_time = [], [], [], []
+
+    results = []
+    for i, batch in enumerate(pbar):
+        # measure time
+        if args.profile:
+            torch.cuda.synchronize()
+            start_time = time.time()
+
+        video_files, frames, video_lengths, img_size_list, texts = batch
+
+        # encode the batch of inputs
+        with Timer() as encode_timer:
+            samples = []
+            for imgs, imgs_size, input_ids in zip(frames, img_size_list, texts):
+                imgs = imgs.cuda()
+                input_ids = input_ids.cuda()
+                _, _, _, _, inputs_embeds, _ = model.prepare_inputs_labels_for_multimodal(
+                    input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
+                )
+                samples.append(inputs_embeds)
+
+        # left-pad every sample to max_len; the padded positions get attention mask 0
+        max_len = max([sample.shape[1] for sample in samples])
+        attention_mask = torch.tensor(
+            [[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))]
+        ).to(model.device)
+        inputs_embeds = [
+            torch.cat(
+                [
+                    torch.zeros(
+                        (1, max_len - samples[i].shape[1], samples[i].shape[-1]),
+                        device=model.device,
+                        dtype=torch.float16,
+                    ),
+                    samples[i],
+                ],
+                dim=1,
+            )
+            for i in range(len(samples))
+        ]
+        inputs_embeds = torch.cat(inputs_embeds, dim=0)
+
+        with Timer() as generate_timer:
+            output_ids = super(type(model), model).generate(
+                inputs_embeds=inputs_embeds,
+                attention_mask=attention_mask,
+                do_sample=False,
+                max_new_tokens=args.max_tokens,
+                use_cache=True,
+            )
+            # skip warmup and add profiling data
+            if args.profile and i >= args.profile_warmup:
+                output_length.append(output_ids.size(0) * output_ids.size(1))
+
+            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+            outputs = [output.replace("\n", " ").strip() for output in outputs]
+
+        # skip warmup and add profiling data
+        if args.profile and i >= args.profile_warmup:
+            # measure time
+            torch.cuda.synchronize()
+            time_taken = time.time() - start_time
+
+            total_time.append(time_taken)
+            encode_time.append(encode_timer.time_taken)
+            generate_time.append(generate_timer.time_taken)
+
+        for video_file, output_text, video_length in zip(video_files, outputs, video_lengths):
+            original_row = rows_by_path[video_file]
+            original_row["text"] = output_text
+            original_row["num_frames"] = video_length
+            results.append(original_row)
+
+    # display profiling info
+    if args.profile:
+        print(output_length)
+        num_samples_after_warmup = total_num_videos - args.bs * args.profile_warmup * dp_size
+        print(f"throughput (samples/s): {num_samples_after_warmup / sum(total_time)}")
+        print(f"average encode time per sample: {sum(encode_time) / num_samples_after_warmup}")
+        print(f"average generate time per sample: {sum(generate_time) / num_samples_after_warmup}")
+        print(f"average number of tokens characters per sample: {sum(output_length) / num_samples_after_warmup}")
+        print(f"Max GPU allocated / GB: {torch.cuda.max_memory_allocated() / 1024**3}")
+        print(f"Max GPU reserved / GB: {torch.cuda.max_memory_reserved() / 1024**3}")
+
+    all_results = [None] * dist.get_world_size() if dist.get_rank() == 0 else None
+    # Ranks sharing a dp_rank (tensor-parallel replicas) hold identical rows; only TP rank 0 sends them.
+    dist.gather_object(results if dist.get_rank(tp_group) == 0 else [], all_results, dst=0)
+
+    if dist.get_rank() == 0:
+        all_results = [item for sublist in all_results if sublist is not None for item in sublist]
+
+        with open(output_file, "w", newline="") as f:
+            # append only the columns the input lacks, so no header is written twice
+            fieldnames = headers + [name for name in ("text", "num_frames") if name not in headers]
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(all_results)
+
+        print(f"Results saved to {output_file}")
+
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", type=str, help="Path to the input CSV file")
+    parser.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-34b")
+    parser.add_argument("--prompt", type=str, default="video-f1-detail-3ex")  # key into PROMPTS
+    parser.add_argument("--resize", type=int, default=336)
+    parser.add_argument("--num-frames", type=int, default=1)  # also the number of "<image>" tokens in the prompt
+    parser.add_argument("--max-tokens", type=int, default=300)  # max_new_tokens for generate
+    parser.add_argument("--bs", type=int, default=16)  # DataLoader batch size
+    parser.add_argument("--tp-size", type=int, default=2)  # main() asserts dp-size * tp-size == world size
+    parser.add_argument("--dp-size", type=int, default=4)
+    parser.add_argument("--num-workers", type=int, default=8)
+    parser.add_argument("--prefetch-factor", type=int, default=8, help="Prefetch factor")
+    parser.add_argument(
+        "--flash-attention",
+        action="store_true",
+        help="Whether to use flash attention. You can turn on this flag for llama model and off for mistral model.",
+    )
+    # debug related: --profile makes main() print timing and GPU memory statistics
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--profile-warmup", type=int, default=1)  # batches skipped before profiling data is recorded
+
+    args = parser.parse_args()
+    main(args)
diff --git a/tools/caption/caption_llava_next.py b/tools/caption/caption_llava_next.py
new file mode 100644
index 0000000..2d1c727
--- /dev/null
+++ b/tools/caption/caption_llava_next.py
@@ -0,0 +1,379 @@
+# code modified based on https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/playground/demo/video_demo.py
+
+import argparse
+import base64
+import csv
+import math
+import os
+import warnings
+
+import cv2
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from decord import VideoReader, cpu
+from llava.constants import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
+from llava.conversation import conv_templates
+from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
+from llava.model.builder import load_pretrained_model
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoConfig, BitsAndBytesConfig
+
+warnings.filterwarnings("ignore")
+
+PAD_TOKEN_ID = 151643
+
+
+def setup(rank, world_size):
+    """Bind this process to GPU `rank` and join a `world_size`-process NCCL group on localhost:12355."""
+    print(f"Setting up process {rank} of {world_size}")
+    os.environ["MASTER_ADDR"] = "localhost"  # rendezvous address is hard-coded
+    os.environ["MASTER_PORT"] = "12355"  # fixed port; replaces any MASTER_PORT already in the environment
+    torch.cuda.set_device(rank)
+    # Join the process group used for communication between the ranks.
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+
+
+def cleanup():
+    """Tear down the process group once inference is done."""
+    dist.destroy_process_group()
+
+
+class VideoDataset:
+    """Rank-strided view of the input rows: item `idx` of rank `r` is row `r + idx * world_size`."""
+
+    def __init__(self, df, args, rank, world_size, image_processor, model, tokenizer):
+        self.df = df
+        self.rank = rank
+        self.world_size = world_size
+        self.args = args
+        self.image_processor = image_processor
+        self.model = model
+        self.tokenizer = tokenizer
+        self.length = len(range(self.rank, len(self.df), self.world_size))  # rows owned by this rank
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, idx):
+        row = self.df.iloc[self.rank + idx * self.world_size]
+        video_path = row["path"]
+        sample_set = {}
+        question = self.args.prompt
+        sample_set["video_name"] = video_path
+        # NOTE: when the file is missing only "video_name" is returned; the collate step has to cope with that.
+        if os.path.exists(video_path):
+            video, frame_time, video_time = load_video(video_path, self.args)
+            video = self.image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half()
+            sample_set["video"] = video
+            sample_set["frame_time"] = frame_time
+            sample_set["video_time"] = video_time
+            sample_set["info"] = row
+
+            if self.args.add_time_instruction:
+                time_instruction = (
+                    f"The video lasts for {video_time:.2f} seconds, and "
+                    f"{self.args.for_get_frames_num} frames are uniformly sampled from it. "
+                    f"These frames are located at {frame_time}. "
+                    f"Please answer the following questions related to this video."
+                )
+                qs = f"{time_instruction}\n{question}"
+            else:
+                qs = question
+            if self.model.config.mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+
+            conv = conv_templates[self.args.conv_mode].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+
+            input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(
+                0
+            )
+            sample_set["input_ids"] = input_ids
+            attention_masks = input_ids.ne(self.tokenizer.pad_token_id).long()
+            sample_set["attention_masks"] = attention_masks
+
+        return sample_set
+
+
+def collate_fn(batch):
+    """Left-pad the prompts of a batch to a common length and gather the per-sample fields into lists."""
+    # Samples whose video file was missing carry only "video_name"; drop them so every field stays aligned.
+    batch = [item for item in batch if "video" in item]
+    if not batch:
+        raise ValueError("collate_fn received no loadable video in this batch")
+
+    videos = [item["video"] for item in batch]
+    input_ids = [item["input_ids"] for item in batch]
+    max_len = max(item.shape[1] for item in input_ids)
+    # Pad on the left: ids with PAD_TOKEN_ID, masks with 0.
+    input_ids = [torch.nn.functional.pad(item, (max_len - item.shape[1], 0), value=PAD_TOKEN_ID) for item in input_ids]
+    input_ids = torch.cat(input_ids, dim=0)
+    attention_masks = [item["attention_masks"] for item in batch]
+    attention_masks = [torch.nn.functional.pad(item, (max_len - item.shape[1], 0), value=0) for item in attention_masks]
+    attention_masks = torch.cat(attention_masks, dim=0)
+
+    return {
+        "input_ids": input_ids,
+        "attention_masks": attention_masks,
+        "videos": videos,
+        "video_names": [item["video_name"] for item in batch],
+        "frame_times": [item["frame_time"] for item in batch],
+        "video_times": [item["video_time"] for item in batch],
+        "infos": [item["info"] for item in batch],
+    }
+
+
+def create_dataloader(df, args, rank, world_size, image_processor, model, tokenizer):
+    """Build the order-preserving DataLoader over the rows of `df` that belong to this rank."""
+    loader_kwargs = dict(batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn, num_workers=args.num_workers)
+    shard = VideoDataset(df, args, rank, world_size, image_processor, model, tokenizer)
+    return DataLoader(shard, **loader_kwargs)
+
+
+def split_list(lst, n):
+    """Split a list into at most n (roughly) equal-sized chunks; an empty list yields no chunks."""
+    chunk_size = max(1, math.ceil(len(lst) / n))  # ceiling division, never 0 (range() rejects a zero step)
+    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    """Return chunk number `k` of the chunks `split_list(lst, n)` produces."""
+    return split_list(lst, n)[k]
+
+
+def parse_args():
+    """
+    Parse command-line arguments (boolean options accept the text "true"/"false", case-insensitively).
+    """
+    parser = argparse.ArgumentParser()
+
+    # Define the command-line arguments
+    parser.add_argument("--data_file", help="Path to the video dataset file.", required=True)
+    parser.add_argument("--output_folder", help="Path to the output folder.", required=True)
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--conv-mode", type=str, default=None)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--mm_resampler_type", type=str, default="spatial_pool")
+    parser.add_argument("--mm_spatial_pool_stride", type=int, default=4)
+    parser.add_argument("--mm_spatial_pool_out_channels", type=int, default=1024)
+    parser.add_argument("--mm_spatial_pool_mode", type=str, default="average")
+    parser.add_argument("--image_aspect_ratio", type=str, default="anyres")
+    parser.add_argument(
+        "--image_grid_pinpoints",
+        type=str,
+        default="[(224, 448), (224, 672), (224, 896), (448, 448), (448, 224), (672, 224), (896, 224)]",
+    )
+    parser.add_argument("--mm_patch_merge_type", type=str, default="spatial_unpad")
+    parser.add_argument("--overwrite", type=lambda x: (str(x).lower() == "true"), default=True)
+    parser.add_argument("--for_get_frames_num", type=int, default=4)
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--num_workers", type=int, default=10)
+    parser.add_argument("--load_8bit", type=lambda x: (str(x).lower() == "true"), default=False)
+    parser.add_argument("--prompt", type=str, default=None)
+    parser.add_argument("--api_key", type=str, help="OpenAI API key")
+    parser.add_argument("--mm_newline_position", type=str, default="no_token")
+    parser.add_argument("--force_sample", type=lambda x: (str(x).lower() == "true"), default=False)
+    parser.add_argument("--add_time_instruction", type=lambda x: (str(x).lower() == "true"), default=False)
+    return parser.parse_args()
+
+
+def load_video(video_path, args):
+    """Sample frames from a video; returns (frames, comma-joined frame timestamps, duration in seconds)."""
+    if args.for_get_frames_num == 0:
+        return np.zeros((1, 336, 336, 3)), "0.00s", 0.0  # same 3-tuple as the normal path, one blank frame
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    total_frame_num = len(vr)
+    video_time = total_frame_num / vr.get_avg_fps()
+    fps = max(1, round(vr.get_avg_fps()))  # an average below 0.5 fps would otherwise give range() a zero step
+    frame_idx = list(range(0, total_frame_num, fps))
+    frame_time = [i / fps for i in frame_idx]
+    if len(frame_idx) > args.for_get_frames_num or args.force_sample:
+        sample_fps = args.for_get_frames_num
+        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
+        frame_idx = uniform_sampled_frames.tolist()
+        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
+    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
+    spare_frames = vr.get_batch(frame_idx).asnumpy()
+
+    return spare_frames, frame_time, video_time
+
+
+def load_video_base64(path):
+    """Read every frame of the video at `path` and return them as base64-encoded JPEG strings, in order."""
+    video = cv2.VideoCapture(path)
+
+    base64Frames = []
+    while video.isOpened():
+        success, frame = video.read()
+        if not success:
+            break
+        _, buffer = cv2.imencode(".jpg", frame)
+        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+
+    video.release()
+    return base64Frames
+
+
+def _row_with_caption(info, caption, num_columns, text_index):
+    """Return the CSV row for `info` as a plain list with `caption` placed in the text column."""
+    # A list copy is used so that `append` and positional assignment behave the same no matter what
+    # row object the dataset hands over.
+    row = list(info)
+    if len(row) < num_columns:
+        row.append(caption)  # the input had no "text" column, so it becomes the trailing one
+    else:
+        row[text_index] = caption
+    return row
+
+
+def run_inference(rank, world_size, args):
+    """
+    Caption this rank's share of `args.data_file` and write `<output_folder>/<data_name>_<rank>.csv`.
+
+    The model is loaded on `cuda:<rank>`, the rows that `VideoDataset` assigns to this rank are
+    captioned without sampling, and every row is written out with the caption in its "text" column
+    (appended as a new last column when the input has none).
+
+    Args:
+        rank: Index of this process; also used as its CUDA device index.
+        world_size: Total number of processes the rows are split across.
+        args: Command-line arguments (see `parse_args`).
+    """
+    setup(rank, world_size)
+
+    device = torch.device(f"cuda:{rank}")
+    # Initialize the model
+    model_name = get_model_name_from_path(args.model_path)
+    # Set model configuration parameters if they exist
+    if args.overwrite:
+        overwrite_config = {}
+        overwrite_config["mm_spatial_pool_mode"] = args.mm_spatial_pool_mode
+        overwrite_config["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
+        overwrite_config["mm_newline_position"] = args.mm_newline_position
+
+        cfg_pretrained = AutoConfig.from_pretrained(args.model_path)
+
+        if "qwen" not in args.model_path.lower():
+            if "224" in cfg_pretrained.mm_vision_tower:
+                # suppose the length of text tokens is around 1000, from bo's report
+                least_token_number = args.for_get_frames_num * (16 // args.mm_spatial_pool_stride) ** 2 + 1000
+            else:
+                least_token_number = args.for_get_frames_num * (24 // args.mm_spatial_pool_stride) ** 2 + 1000
+
+            # Stretch the context length in multiples of 4096 until the estimate fits.
+            scaling_factor = math.ceil(least_token_number / 4096)
+            if scaling_factor >= 2:
+                if "vicuna" in cfg_pretrained._name_or_path.lower():
+                    print(float(scaling_factor))
+                    overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"}
+                overwrite_config["max_sequence_length"] = 4096 * scaling_factor
+                overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor
+        # Load the weights, optionally quantized to 8 bit.
+        if args.load_8bit:
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_8bit_compute_dtype=torch.bfloat16)
+            tokenizer, model, image_processor, context_len = load_pretrained_model(
+                args.model_path,
+                args.model_base,
+                model_name,
+                device_map=device,
+                quantization_config=quantization_config,
+                overwrite_config=overwrite_config,
+            )
+        else:
+            tokenizer, model, image_processor, context_len = load_pretrained_model(
+                args.model_path, args.model_base, model_name, device_map=device, overwrite_config=overwrite_config
+            )
+    else:
+        tokenizer, model, image_processor, context_len = load_pretrained_model(
+            args.model_path, args.model_base, model_name, device_map=device
+        )
+
+    if tokenizer.pad_token_id is None:
+        if "qwen" in tokenizer.name_or_path.lower():
+            # No pad token configured: fall back to the module-level PAD_TOKEN_ID.
+            tokenizer.pad_token_id = PAD_TOKEN_ID
+
+    # Pad on the left when batching, the same side collate_fn pads on.
+    if args.batch_size > 1:
+        tokenizer.padding_side = "left"
+        model.config.tokenizer_padding_side = "left"
+
+    # The model config decides these two switches; any value given on the command line is replaced.
+    if getattr(model.config, "force_sample", None) is not None:
+        args.force_sample = model.config.force_sample
+    else:
+        args.force_sample = False
+
+    if getattr(model.config, "add_time_instruction", None) is not None:
+        args.add_time_instruction = model.config.add_time_instruction
+    else:
+        args.add_time_instruction = False
+
+    # Output columns are the input columns, plus "text" when the input does not already have it.
+    df = pd.read_csv(args.data_file)
+    data_name = os.path.basename(args.data_file).split(".csv")[0]
+    column_names = df.columns.to_list()
+    if "text" not in column_names:
+        column_names.append("text")
+    text_column_index = column_names.index("text")
+
+    # Every rank writes its own file; create the folder first so open() cannot fail on a fresh path.
+    os.makedirs(args.output_folder, exist_ok=True)
+    output_file = os.path.join(args.output_folder, f"{data_name}_{rank}.csv")
+    stop_str = "###"
+    with open(output_file, "w", newline="") as csvfile:
+        csvwriter = csv.writer(csvfile)
+        csvwriter.writerow(column_names)
+
+        dataloader = create_dataloader(df, args, rank, world_size, image_processor, model, tokenizer)
+
+        for batch in tqdm(dataloader):
+            videos = [item.to(device) for item in batch["videos"]]
+            input_ids = batch["input_ids"].to(device)
+            attention_masks = batch["attention_masks"].to(device)
+            infos = batch["infos"]
+            with torch.inference_mode():
+                modalities = ["video"] * len(videos)
+                # One call serves every model family: the former mistral / non-mistral branches were
+                # identical and only forced a lookup of `cfg_pretrained`, which is never assigned when
+                # `--overwrite false` is used.
+                output_ids = model.generate(
+                    inputs=input_ids,
+                    images=videos,
+                    attention_mask=attention_masks,
+                    modalities=modalities,
+                    do_sample=False,
+                    temperature=0.0,
+                    max_new_tokens=1024,
+                    top_p=0.1,
+                    num_beams=1,
+                    use_cache=True,
+                )
+
+            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+            # Keep only the text before the stop marker.
+            outputs = [output.split(stop_str)[0].strip() for output in outputs]
+
+            # write to csv
+            for info, caption in zip(infos, outputs):
+                csvwriter.writerow(_row_with_caption(info, caption, len(column_names), text_column_index))
+
+    cleanup()
+
+
+if __name__ == "__main__":
+    world_size = torch.cuda.device_count()  # one worker process per visible CUDA device
+    print(f"Running inference on {world_size} GPUs.")
+    args = parse_args()
+
+    # Spawn one process per GPU; mp.spawn supplies the process index as the first argument (the rank).
+    mp.spawn(run_inference, args=(world_size, args), nprocs=world_size, join=True)
diff --git a/tools/caption/caption_trans.py b/tools/caption/caption_trans.py
new file mode 100644
index 0000000..9126eb3
--- /dev/null
+++ b/tools/caption/caption_trans.py
@@ -0,0 +1,79 @@
+import argparse
+import base64
+import csv
+import os
+from io import BytesIO
+
+import tqdm
+from openai import OpenAI
+
+from .utils import IMG_EXTENSIONS, PROMPTS, VID_EXTENSIONS, VideoTextDataset
+
+client = OpenAI()
+
+
+def to_base64(image):
+    buffer = BytesIO()  # in-memory sink for the encoded bytes
+    image.save(buffer, format="JPEG")  # assumes `image` exposes a PIL-style save() — TODO confirm
+    return base64.b64encode(buffer.getvalue()).decode("utf-8")  # base64 text of the JPEG bytes
+
+
+def get_caption(text, prompt):
+    """Send `text` to the chat model with `prompt` as the system message; return the reply on one line."""
+    response = client.chat.completions.create(
+        model="gpt-4o-2024-08-06",
+        messages=[
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": text},
+        ],
+        max_tokens=300,
+        top_p=0.1,
+    )
+    # The message content may be None (for example on a refusal); treat that as an empty caption.
+    caption = response.choices[0].message.content or ""
+    return caption.replace("\n", " ")
+
+
+def main(args):
+    """Rewrite every caption of the input CSV with the selected prompt and save `<input>_captrans.csv`."""
+    # ======================================================
+    # 1. read video list
+    # ======================================================
+    dataset = VideoTextDataset(args.input, text_only=True)
+    output_file = os.path.splitext(args.input)[0] + "_captrans.csv"
+
+    # make sure that the prompt type matches the data type
+    data_extension = "." + dataset.data["path"].iloc[0].split(".")[-1]
+    prompt_type = PROMPTS[args.prompt]["type"]
+    if prompt_type == "image":
+        assert (
+            data_extension.lower() in IMG_EXTENSIONS
+        ), "The prompt is suitable for an image dataset but the data is not image."
+    elif prompt_type == "video":
+        assert (
+            data_extension.lower() in VID_EXTENSIONS
+        ), "The prompt is suitable for a video dataset but the data is not video."
+    else:
+        raise ValueError(f"Found invalid prompt type {prompt_type}")
+
+    # ======================================================
+    # 2. generate captions
+    # ======================================================
+    prompt = PROMPTS[args.prompt]["text"]
+    # The output is opened only after validation, and through `with`, so a failure cannot leak the
+    # handle; newline="" is what the csv module asks for on files it writes.
+    with open(output_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["video", "text"])
+        for sample in tqdm.tqdm(dataset):
+            caption = get_caption(sample["text"], prompt)
+            writer.writerow((sample["path"], caption))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", type=str, help="Path to the input CSV file")
+    parser.add_argument("--prompt", type=str, default="video-captrans-template")  # key into PROMPTS; "1k/20min" was the author's note, presumably throughput — confirm
+    args = parser.parse_args()
+
+    main(args)
diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py
new file mode 100644
index 0000000..044ac80
--- /dev/null
+++ b/tools/caption/pllava_dir/caption_pllava.py
@@ -0,0 +1,407 @@
+import os
+import sys
+from pathlib import Path
+
+current_file = Path(__file__)  # Gets the path of the current file
+fourth_level_parent = current_file.parents[3]
+
+datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets")
+import sys
+
+sys.path.append(datasets_dir)
+from read_video import read_video_av
+
+sys.path.remove(datasets_dir)
+
+import itertools
+import logging
+import random
+import traceback
+from argparse import ArgumentParser
+from multiprocessing import Process, Queue
+
+import colossalai
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torchvision
+import transformers
+from colossalai.utils import get_current_device
+from PIL import Image
+from tasks.eval.eval_utils import Conversation
+from tasks.eval.model_utils import load_pllava
+from torch.utils.data import DataLoader, Dataset, DistributedSampler
+from torch.utils.data._utils.collate import default_collate
+from tqdm import tqdm
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+def parse_args():
+    """Parse the captioning CLI; only `--csv_path` is mandatory, every other option has a default."""
+    parser = ArgumentParser()
+    parser.add_argument("--pretrained_model_name_or_path", type=str, required=False, default="llava-hf/llava-1.5-7b-hf")
+    parser.add_argument("--prompt_template", type=str, default="general", choices=["general", "person"])
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        required=False,
+        default=1,
+    )
+    parser.add_argument(
+        "--csv_path",
+        type=str,
+        required=True,
+    )
+    parser.add_argument(
+        "--num_frames",
+        type=int,
+        required=False,
+        default=4,
+    )
+    parser.add_argument("--use_lora", action="store_true")
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        required=False,
+        default=4,
+    )
+    parser.add_argument(
+        "--weight_dir",
+        type=str,
+        required=False,
+        default=None,
+    )
+    parser.add_argument(
+        "--conv_mode",
+        type=str,
+        required=False,
+        default="eval_mvbench",
+    )
+    parser.add_argument(
+        "--pooling_shape",
+        type=str,
+        required=False,
+        default=None,
+    )
+    parser.add_argument(
+        "--error_message",
+        type=str,
+        required=False,
+        default="error occured during captioning",
+    )
+    parser.add_argument("--keep_failed", action="store_true", default=False)
+    parser.add_argument(
+        "--short_caption_ratio",
+        type=float,
+        required=False,
+        default=0,
+    )
+
+    return parser.parse_args()
+
+
+###############
+# data processing
+###############
+
+
+def get_index(num_frames, num_segments):
+    """Return `num_segments` frame offsets: a half-segment start plus the rounded start of each segment."""
+    step = float(num_frames - 1) / num_segments
+    half = int(step / 2)
+    return np.array([half + int(np.round(step * seg)) for seg in range(num_segments)])
+
+
+def load_video(video_path, num_frames, return_msg=False, resolution=336):
+    """
+    Decode a video and return `num_frames` sampled frames as resized PIL images.
+
+    Args:
+        video_path: Path of the video to decode.
+        num_frames: Number of frames to sample (indices come from `get_index`).
+        return_msg: Reserved for a textual timing message; not implemented.
+        resolution: Size passed to `torchvision.transforms.Resize`.
+
+    Raises:
+        NotImplementedError: If `return_msg` is true.
+    """
+    if return_msg:
+        # Fail before decoding anything, and with a regular exception: exit() raises SystemExit instead.
+        raise NotImplementedError("return_msg not implemented yet")
+    transforms = torchvision.transforms.Resize(size=resolution)
+    vframes, aframes, info = read_video_av(video_path, pts_unit="sec", output_format="THWC")
+    total_num_frames = len(vframes)
+    frame_indices = get_index(total_num_frames, num_frames)
+    images_group = [transforms(Image.fromarray(vframes[i].numpy())) for i in frame_indices]
+    return images_group
+
+
+class CSVDataset(Dataset):
+    """Videos listed in the `path` column of a CSV, each loaded as `num_frames` sampled frames."""
+
+    def __init__(self, csv_path, num_frames):
+        self.df = pd.read_csv(csv_path)
+        self.data_list = self.df.path.tolist()
+        self.num_frames = num_frames
+
+    def __len__(self):
+        return len(self.data_list)
+
+    def __getitem__(self, idx):
+        try:
+            return load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
+        except Exception:  # unreadable video -> None; KeyboardInterrupt/SystemExit are no longer swallowed
+            return None
+
+    @staticmethod
+    def collate_fn(batch):
+        """Run the processor on every loaded video; returns (batch, max_tokens) or (None, None)."""
+        batch = [item for item in batch if item is not None]
+        if not batch:
+            return None, None
+
+        # Strict `<` so that a ratio of 0 can never pick the short prompt (random() may return 0.0).
+        if random.random() < SHORT_CAPTION_RATIO:
+            prompt, max_tokens = SHORT_PROMPT, MAX_SHORT_TOKENS
+        else:
+            prompt, max_tokens = LONG_PROMPT, MAX_LONG_TOKENS
+
+        processed_batch = [processor(text=prompt, images=video, return_tensors="pt") for video in batch]
+        batch = default_collate(processed_batch)
+
+        for k, v in batch.items():
+            if k in ("input_ids", "attention_mask"):
+                batch[k] = v.squeeze(1)
+            elif k == "pixel_values":
+                b, t, c, h, w = v.shape
+                batch[k] = v.reshape(b * t, c, h, w)
+        return batch, max_tokens
+
+    @staticmethod
+    def post_process(output_texts, processor):
+        """Turn generated ids into single-line captions; entries that are already text skip decoding."""
+        if output_texts and isinstance(output_texts[0], str):
+            output_texts = list(output_texts)  # error placeholders arrive as plain strings
+        else:
+            output_texts = processor.batch_decode(
+                output_texts, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+        role = LONG_CONV_TEMPLATE.roles[-1]
+        split_tag = "<|im_start|> assistant\n" if role == "<|im_start|>assistant\n" else role
+        ending = LONG_CONV_TEMPLATE.sep if isinstance(LONG_CONV_TEMPLATE.sep, str) else LONG_CONV_TEMPLATE.sep[1]
+        for i, output_text in enumerate(output_texts):
+            output_texts[i] = output_text.split(split_tag)[-1].removesuffix(ending).strip().replace("\n", " ")
+        return output_texts
+
+
+def load_model_and_dataset(
+    pretrained_model_name_or_path,
+    num_frames,
+    use_lora,
+    lora_alpha,
+    weight_dir,
+    csv_path,
+    pooling_shape=(16, 12, 12),
+):
+    """Load PLLaVA in eval mode on the current device and build the CSV dataset; returns (model, processor, dataset)."""
+    model, processor = load_pllava(
+        pretrained_model_name_or_path,
+        num_frames=num_frames,
+        use_lora=use_lora,
+        weight_dir=weight_dir,
+        lora_alpha=lora_alpha,
+        pooling_shape=pooling_shape,
+    )
+
+    # Original author's warning: very large models (30B+) may use up memory heavily at this point.
+    model = model.to(device=get_current_device())
+    model = model.eval()
+
+    dataset = CSVDataset(csv_path, num_frames)
+    return model, processor, dataset
+
+
+def infer(model, batch, max_tokens):
+    """
+    Generate caption token ids for one collated batch.
+
+    The batch is moved to the current device and decoded without sampling (single beam), producing at
+    most `max_tokens` new tokens per sample. Every generated sequence is returned on the CPU.
+    """
+    generation_kwargs = dict(
+        media_type="video",
+        do_sample=False,
+        max_new_tokens=max_tokens,
+        num_beams=1,
+        min_length=1,
+        repetition_penalty=1.0,
+        length_penalty=1,
+        temperature=1.0,
+    )
+    with torch.no_grad():
+        generated = model.generate(**batch.to(get_current_device()), **generation_kwargs)
+    return [sequence.cpu() for sequence in generated]
+
+
+def inference_loop(args, model, dataset, q: Queue):
+    """Caption this rank's samples batch by batch, putting each batch's predictions on `q`, then None."""
+    sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False)
+    dataloader = DataLoader(
+        dataset,
+        num_workers=2,
+        batch_size=args.batch_size,
+        collate_fn=CSVDataset.collate_fn,
+        pin_memory=True,
+        sampler=sampler,
+    )
+
+    for i, (batch, max_tokens) in enumerate(tqdm(dataloader, disable=dist.get_rank() != 0)):
+        try:
+            if batch is None:
+                raise Exception("Video not loaded properly")
+            preds = infer(model, batch, max_tokens=max_tokens)
+        except Exception as e:
+            logger.error(f"error at rank {dist.get_rank()} sample {i}: {str(e)}")
+            traceback.print_exception(e)
+            # One placeholder per sample the sampler scheduled for this batch. `len(batch)` cannot be used:
+            # `batch` is None when no video loaded, and otherwise a collated mapping, not a list of samples.
+            num_samples = min(args.batch_size, len(sampler) - i * args.batch_size)
+            preds = [args.error_message] * num_samples
+        q.put(preds)
+    # finish the queue
+    q.put(None)
+
+
+def post_process_loop(processor, q: Queue, result_q: Queue):
+    """
+    Consume prediction batches from `q` until the None sentinel arrives, post-process each one with
+    `CSVDataset.post_process`, and finally put the flat list of all captions on `result_q`.
+    """
+    collected = []
+    # Block on the queue; the producer signals completion with a single None.
+    while (raw_preds := q.get()) is not None:
+        collected.extend(CSVDataset.post_process(raw_preds, processor))
+    result_q.put(collected)
+
+
+def main():
+    """Caption every video of `--csv_path` with PLLaVA across all ranks; rank 0 writes `*_text.csv`."""
+    args = parse_args()
+    if args.prompt_template == "general":
+        long_pt = "Describe this video. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway."
+        short_pt = "Describe the video focusing on key objects and actions. The description should be brief yet detailed enough for AI to recreate the video. Keep the description to no more than three sentences. Here are some examples of good descriptions: 1. A stylish woman walks confidently down a neon-lit Tokyo street, wearing a black leather jacket and a long red dress, with pedestrians and reflective wet pavement around her. 2. Giant wooly mammoths tread through a snowy meadow, their fur blowing lightly in the wind, with snowy trees and mountains in the background. 3. A drone captures waves crashing against rugged cliffs along Big Sur, with golden sunset light illuminating the rocky shore and a lighthouse in the distance."
+    elif args.prompt_template == "person":
+        # pt = "Describe this video in detail. Pay special attention to all details of the person, including the face, the body, the pose, the action, and the outfit. Also pay attention to the camera angle. The description should be useful for AI to re-generate the video. The description should contain no more than six sentences."
+        long_pt = "Describe this video in detail. Pay special attention to all details of the person, including 1. apperance, such as hair, face, body, and outfit; 2. expression and emotion; 3. action and pose. Also pay attention to the background and the surrounding environment. Also pay attention to the camera angle. The description should be useful for AI to re-generate the video. The description should contain no more than six sentences."
+        short_pt = "Describe this video in detail. Pay special attention to key details of the person, including 1. apperance, such as hair, face, body, and outfit; 2. expression and emotion; 3. action and pose. Also pay attention to the background and the surrounding environment. Also pay attention to the camera angle. The description should be useful for AI to re-generate the video. The description should contain no more than three sentences."
+    else:
+        raise ValueError
+
+    assert (
+        args.short_caption_ratio >= 0 and args.short_caption_ratio <= 1
+    ), "`short_caption_ratio` should be in range [0, 1]"
+
+    global LONG_CONV_TEMPLATE
+    global SHORT_CONV_TEMPLATE
+    global LONG_PROMPT
+    global SHORT_PROMPT
+    global RESOLUTION
+    global SHORT_CAPTION_RATIO
+    global MAX_LONG_TOKENS
+    global MAX_SHORT_TOKENS
+
+    LONG_CONV_TEMPLATE = Conversation(
+        system=long_pt,
+        roles=("USER:", "ASSISTANT:"),
+        messages=[],
+        sep=(" ", "</s>"),
+        mm_token="<image>",
+    )
+    LONG_CONV_TEMPLATE.user_query("Describe the video in detail.", is_mm=True)
+    LONG_PROMPT = LONG_CONV_TEMPLATE.get_prompt()
+
+    SHORT_CONV_TEMPLATE = Conversation(
+        system=short_pt,
+        roles=("USER:", "ASSISTANT:"),
+        messages=[],
+        sep=(" ", "</s>"),
+        mm_token="<image>",
+    )
+    SHORT_CONV_TEMPLATE.user_query("Describe the video in detail.", is_mm=True)
+    SHORT_PROMPT = SHORT_CONV_TEMPLATE.get_prompt()
+
+    RESOLUTION = 672
+    SHORT_CAPTION_RATIO = args.short_caption_ratio
+    MAX_LONG_TOKENS = 256
+    MAX_SHORT_TOKENS = 128
+
+    colossalai.launch_from_torch()
+    rank = dist.get_rank()
+
+    # setup debug
+    if rank == 0:
+        import os
+
+        if os.getenv("DEBUG_ADDRESS") is not None:
+            import ptvsd
+
+            ptvsd.enable_attach(address=("localhost", int(os.getenv("DEBUG_ADDRESS"))), redirect_output=True)
+            ptvsd.wait_for_attach()
+            print("waiting for debugger attachment")
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        logger.setLevel(transformers.logging.ERROR)
+
+    # setup model and dataset; without --pooling_shape the default of load_model_and_dataset applies
+    pooling_kwargs = {} if args.pooling_shape is None else {"pooling_shape": tuple(int(x) for x in args.pooling_shape.split("-"))}
+
+    global processor
+    model, processor, dataset = load_model_and_dataset(
+        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
+        num_frames=args.num_frames,
+        use_lora=args.use_lora,
+        lora_alpha=args.lora_alpha,
+        weight_dir=args.weight_dir,
+        **pooling_kwargs,
+        csv_path=args.csv_path,
+    )
+    logger.info(f"Dataset loaded with {len(dataset)} samples.")
+    q = Queue()
+    result_q = Queue()
+    p = Process(target=post_process_loop, args=(processor, q, result_q))
+    p.start()
+
+    inference_loop(args, model, dataset, q)
+    results = result_q.get()
+    p.join()
+
+    # gather results
+    results_list = [None for _ in range(dist.get_world_size())] if rank == 0 else None
+    dist.gather_object(results, results_list, dst=0)
+    if rank == 0:
+        # reorder and merge
+        final_results = list(itertools.chain.from_iterable(zip(*results_list)))
+        assert len(final_results) >= len(dataset)
+        # remove padding
+        final_results = final_results[: len(dataset)]
+
+        # write the results to the csv file
+        df = pd.read_csv(args.csv_path)
+        # add a new column to the dataframe
+        df["text"] = final_results
+        drop_failed = not args.keep_failed
+        if drop_failed:
+            df = df[df["text"] != args.error_message]
+            print(f"Dropped {len(dataset) - len(df)} samples")
+        # write the dataframe to a new csv file named '*_text.csv'
+        new_csv_path = args.csv_path.replace(".csv", "_text.csv")
+        df.to_csv(new_csv_path, index=False)
+        print(f"Results saved to {new_csv_path}")
+
+
+if __name__ == "__main__":
+    main()  # main() parses the command line itself
diff --git a/tools/caption/utils.py b/tools/caption/utils.py
new file mode 100644
index 0000000..882c259
--- /dev/null
+++ b/tools/caption/utils.py
@@ -0,0 +1,157 @@
+import time
+
+import pandas as pd
+import torch
+import torchvision.transforms as transforms
+from torchvision.datasets.folder import pil_loader
+
+from tools.datasets.utils import extract_frames, is_video
+
+IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
+VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
+PROMPTS = {
+    "image": {
+        "text": "Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than five sentences. Remember do not exceed 5 sentences.",
+        "type": "image",
+    },
+    "image-text": {
+        "text": "Describe this image and its style in a very detailed manner. Pay attention to all objects in the image. The description should be useful for AI to re-generate the image. The description should be no more than six sentences. Some information about the image is '{}'.",
+        "type": "image",
+    },
+    "image-3ex": {
+        "text": "An image is given. Describe this image and its style to generate a succinct yet informative description. Pay attention to all objects in the image. The description should be useful for AI to re-generate the video. The description should be no more than five sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick and walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
+        "type": "image",
+    },
+    "video": {
+        "text": "Describe this video and its style in a very detailed manner. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
+        "type": "video",
+    },
+    "video-template": {  # crafted by gpt-4o
+        "text": "Frames of a video clip are given. Write a concise yet detailed paragraph, following this sequence: 1. Subject: Describe the main subject in detail, including their appearance and what they are doing. 2. Subject Movement: Provide a detailed account of the subject’s movements or actions. 3. Scene Description: Describe the environment, setting, or location in as much detail as possible. 4. Atmosphere: Briefly describe the overall mood (e.g., lively, calm, cinematic). 5. Light and Shadow: Provide a brief description of the lighting (e.g., natural, artificial, morning light, sunset). 6. Camera Technique: Mention the camera angles and shots (e.g., close-up, wide shot) briefly.\nStart directly with descriptions, avoiding phrases like ‘the video’ or ‘the subject.’ Focus on providing detailed information for the first three points, while keeping the last three points more concise. Avoid any guesses or assumptions (e.g., avoid words like ‘possibly’ or ‘suggest’). Ensure the description is clear and based on observable details for a video-generation model to recreate the scene.",
+        "type": "video",
+    },
+    "video-captrans-template": {  # crafted by gpt-4o
+        "text": "A paragraph description of a video clip is given. Rewrite the description in a concise yet detailed manner, following this sequence: 1. Subject: Clearly describe the main subject, including their appearance and what they are doing. 2. Subject Movement: Provide a detailed account of the subject’s movements or actions. 3. Scene Description: Describe the environment or setting as thoroughly as possible. 4. Atmosphere: Briefly summarize the overall mood (e.g., lively, calm, cinematic). 5. Light and Shadow: Provide a brief note on the lighting (e.g., natural, artificial, morning light, sunset). 6. Camera Technique: Mention the camera angles and shots (e.g., close-up, wide shot) briefly.\nEnsure that the rewritten description only includes visible elements from the original paragraph and does not add any new details that were not mentioned. Keep the descriptions of the last three points simple. Avoid any guesses or assumptions (e.g., words like ‘possibly’ or ‘suggest’).",
+        "type": "video",
+    },
+    "video-text": {
+        "text": "Describe this video and its style in a very detailed manner. Some information about the image is '{}'. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences.",
+        "type": "video",
+    },
+    "video-f1-detail-3ex": {
+        "text": "A video is given by providing the middle frame. Describe this video and its style to generate a description. Pay attention to all objects in the video. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
+        "type": "video",
+    },
+    "video-f1-detail-2ex-text": {
+        "text": "A video is given by providing the middle frame. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
+        "type": "video",
+    },
+    "video-f3-detail-3ex": {
+        "text": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
+        "type": "video",
+    },
+    "video-f3-detail-2ex-text": {
+        "text": "A video is given by providing three frames in chronological order. Some information about the image is '{}'. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be no more than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.",
+        "type": "video",
+    },
+}
+
+
+NUM_FRAMES_POINTS = {
+    1: (0.5,),
+    2: (0.25, 0.75),
+    3: (0.1, 0.5, 0.9),
+}
+
+
+def read_file(input_path):
+    """Load the table stored at `input_path` (.csv or .parquet) with pandas; other suffixes are rejected."""
+    if input_path.endswith(".csv"):
+        return pd.read_csv(input_path)
+    if input_path.endswith(".parquet"):
+        return pd.read_parquet(input_path)
+    raise NotImplementedError(f"Unsupported file format: {input_path}")
+
+
+class VideoTextDataset(torch.utils.data.Dataset):
+    """
+    Map-style dataset over the rows of a csv/parquet table that has a "path" column.
+
+    Unless `text_only` is set, each sample carries the image (or the frames sampled from the
+    video) found at "path"; every sample also gets a "text" entry.
+    """
+
+    def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None, text_only=False):
+        self.csv_path = csv_path
+        self.transform = transform
+        self.get_text_input_ids = get_text_input_ids
+        self.text_only = text_only
+        self.data = read_file(csv_path)
+        # sampling points handed to extract_frames (presumably fractions of the clip length)
+        self.points = NUM_FRAMES_POINTS[num_frames]
+        self.resize_size = resize
+        self.resize = None
+        if resize is not None:
+            self.resize = transforms.Resize(resize, transforms.InterpolationMode.BICUBIC)
+        # an existing "text" column is forwarded (raw or through get_text_input_ids) with each sample
+        self.use_text = "text" in self.data.columns
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        return self.getitem(index)
+
+    def getitem(self, index):
+        """Build the sample dict for row `index`: "path", "text" and, unless text_only, the image fields."""
+        row = self.data.iloc[index]
+        path = row["path"]
+        out = dict(path=path)
+
+        if not self.text_only:
+            if is_video(path):
+                images, length = extract_frames(path, points=self.points, backend="opencv", return_length=True)
+            else:
+                images, length = [pil_loader(path)], 1
+            if self.resize_size is not None:
+                # frames whose width and height are both <= resize_size skip the Resize
+                limit = self.resize_size
+                images = [self.resize(im) if im.size[0] > limit or im.size[1] > limit else im for im in images]
+            # sizes are recorded after the optional resize but before `transform`
+            imgs_size = [im.size for im in images]
+            if self.transform is not None:
+                images = self.transform(images)
+            # images are kept in a list as the pytorch dataloader does not accept PIL images
+            out.update(image=images, length=length, img_size=imgs_size)
+
+        if self.get_text_input_ids is None:
+            out["text"] = row["text"] if self.use_text else ""
+        elif self.use_text:
+            out["text"] = self.get_text_input_ids(row["text"])
+        else:
+            out["text"] = self.get_text_input_ids()
+        return out
+
+
+def collate_fn(batch):
+    """Regroup a list of sample dicts into (paths, images, lengths, img_sizes, texts) lists; nothing is stacked."""
+    paths = [item["path"] for item in batch]
+    # text_only samples carry no image fields: yield None for them instead of raising KeyError
+    images, lengths, img_sizes = ([item.get(k) for item in batch] for k in ("image", "length", "img_size"))
+    texts = [item["text"] for item in batch]
+    return paths, images, lengths, img_sizes, texts
+
+
+class Timer:
+    """Context manager that stores the wall-clock duration of its `with` body in `time_taken` (seconds)."""
+
+    def __init__(self):
+        self.time_taken = self.start_time = self.end_time = 0
+
+    def __enter__(self):
+        self.start_time = time.time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        self.end_time = time.time()
+        self.time_taken = self.end_time - self.start_time
diff --git a/tools/frame_interpolation/README.md b/tools/frame_interpolation/README.md
new file mode 100644
index 0000000..3886018
--- /dev/null
+++ b/tools/frame_interpolation/README.md
@@ -0,0 +1,44 @@
+# Frame Interpolation
+
+In the current version, we sample 1 frame out of every 3 frames of the video. Although we plan to use a VAE to avoid frame loss, for now we provide a frame interpolation tool to interpolate the video. The frame interpolation tool is based on [AMT](https://github.com/MCG-NKU/AMT).
+
+Interpolation can be useful for scenery videos, but it may not be suitable for videos with fast motion.
+
+## Requirement
+
+Install the required dependencies by following the "Data Dependencies" and "Frame Interpolation" sections of our [installation instructions](../../docs/installation.md).
+
+<!-- ```bash
+conda install -c conda-forge opencv
+pip install imageio
+``` -->
+
+## Model
+
+We use **AMT** as our frame interpolation model. After sampling, you can use the frame interpolation model to interpolate your video smoothly.
+
+## Usage
+
+The ckpt file will be automatically downloaded to the `--ckpt` path (default: `./pretrained_models/amt-g.pth`). You can apply frame interpolation to a single video file or to a folder of videos.
+
+1. Process a video file
+
+```bash
+python -m tools.frame_interpolation.interpolation your_video.mp4
+```
+
+2. Process all video files in the target directory
+
+```bash
+python -m tools.frame_interpolation.interpolation your_video_dir --output_path samples/interpolation
+```
+
+The output video will be stored in `output_path`, and its duration equals `the total number of frames after frame interpolation / the frame rate`.
+
+### Command Line Arguments
+
+* `input`: Path of the input. Either a **video path** or a **folder path** (a path without a video file extension is treated as a folder).
+* `--ckpt`: Pretrained model of [AMT](https://github.com/MCG-NKU/AMT). Default path: `./pretrained_models/amt-g.pth`.
+* `--niters`: Iterations of interpolation. With $m$ input frames, `--niters` $=n$ corresponds to $2^n\times (m-1)+1$ output frames.
+* `--fps`: Frame rate of the input video. (Default: 8)
+* `--output_path`: **Folder Path** of the output video.
diff --git a/tools/frame_interpolation/__init__.py b/tools/frame_interpolation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/frame_interpolation/interpolation.py b/tools/frame_interpolation/interpolation.py
new file mode 100644
index 0000000..c9d8d1d
--- /dev/null
+++ b/tools/frame_interpolation/interpolation.py
@@ -0,0 +1,219 @@
+# this script is modified from https://github.com/MCG-NKU/AMT/blob/main/demos/demo_2x.py
+import argparse
+import os
+import os.path as osp
+
+import cv2
+import numpy as np
+import torch
+
+from opensora.utils.ckpt_utils import download_model
+
+from .networks.amt_g import Model
+from .utils.utils import InputPadder, img2tensor, tensor2img
+
+hf_endpoint = os.environ.get("HF_ENDPOINT")
+if hf_endpoint is None:
+    hf_endpoint = "https://huggingface.co"
+VID_EXT = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm"]
+network_cfg = {
+    "params": {
+        "corr_radius": 3,
+        "corr_lvls": 4,
+        "num_flows": 5,
+    },
+}
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def init():
+    """
+    Pick the constants used by get_input_video_from_path to compute the frame scale.
+
+    returns:
+        (anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail); on cuda the last one is
+        the total memory reported for the device, on cpu the memory terms are dummy values.
+    """
+    if device != "cuda":
+        # Do not resize in cpu mode
+        return 8192 * 8192, 1, 0, 1
+
+    total_memory = torch.cuda.get_device_properties(device).total_memory
+    print("VRAM available: {:.1f} MB".format(total_memory / 1024**2))
+    anchor_resolution = 1024 * 512
+    anchor_memory = 1500 * 1024**2
+    anchor_memory_bias = 2500 * 1024**2
+
+    return anchor_resolution, anchor_memory, anchor_memory_bias, total_memory
+
+
+def get_input_video_from_path(input_path):
+    """
+    Get the input video from the input_path.
+
+    params:
+        input_path: str, the path of the input video.
+    returns:
+        inputs: list, the list of the input frames (padded RGB tensors moved to `device`).
+        scale: float, the scale of the input frames.
+        padder: InputPadder, the padder to pad the input frames.
+    raises:
+        TypeError: if the extension of input_path is not one of VID_EXT.
+    """
+    anchor_resolution, anchor_memory, anchor_memory_bias, vram_avail = init()
+
+    if osp.splitext(input_path)[-1].lower() in VID_EXT:
+        vcap = cv2.VideoCapture(input_path)
+        inputs = []
+        w = int(vcap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        h = int(vcap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        scale = anchor_resolution / (h * w) * np.sqrt((vram_avail - anchor_memory_bias) / anchor_memory)
+        scale = 1 if scale > 1 else scale
+        scale = 1 / np.floor(1 / np.sqrt(scale) * 16) * 16
+        if scale < 1:
+            print(f"Due to the limited VRAM, the video will be scaled by {scale:.2f}")
+        padding = int(16 / scale)
+        padder = InputPadder((h, w), padding)
+        while True:
+            ret, frame = vcap.read()
+            if ret is False:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame_t = img2tensor(frame).to(device)
+            frame_t = padder.pad(frame_t)
+            inputs.append(frame_t)
+        vcap.release()  # free the decoder handle instead of waiting for garbage collection
+        print(f"Loading the [video] from {input_path}, the number of frames [{len(inputs)}]")
+    else:
+        raise TypeError("Input should be a video.")
+
+    return inputs, scale, padder
+
+
+def load_model(ckpt):
+    """
+    Instantiate the interpolation model from `network_cfg` and restore its weights.
+
+    `ckpt` is indexed with "state_dict", i.e. it is the loaded checkpoint mapping rather than a path.
+    The returned model sits on `device` in eval mode.
+    """
+    net = Model(**network_cfg.get("params", {}))
+    net.load_state_dict(ckpt["state_dict"])
+    return net.to(device).eval()
+
+
+def interpolater(model, inputs, scale, padder, iters=1):
+    """
+    Insert a model-predicted frame between every pair of neighbouring frames, `iters` times.
+
+    params:
+        model: nn.Module, the frame interpolation model.
+        inputs: list, the list of the input frames.
+        scale: float, the scale of the input frames.
+        padder: the padder whose unpad() is applied to the resulting frames.
+        iters: int, number of passes; m input frames become 2 ** iters * (m - 1) + 1 frames.
+    returns:
+        outputs: the output frames, as returned by padder.unpad.
+    """
+    print("Start frame interpolation:")
+    embt = torch.tensor(1 / 2).float().view(1, 1, 1, 1).to(device)
+
+    outputs = inputs  # iters == 0: nothing to insert, the frames are only unpadded
+    for i in range(iters):
+        print(f"Iter {i+1}. input_frames={len(inputs)} output_frames={2*len(inputs)-1}")
+        outputs = [inputs[0]]
+        for in_0, in_1 in zip(inputs[:-1], inputs[1:]):
+            in_0 = in_0.to(device)
+            in_1 = in_1.to(device)
+            with torch.no_grad():
+                imgt_pred = model(in_0, in_1, embt, scale_factor=scale, eval=True)["imgt_pred"]
+            outputs += [imgt_pred.cpu(), in_1.cpu()]
+        inputs = outputs
+
+    return padder.unpad(*outputs)
+
+
+def write(outputs, input_path, output_path, fps=30):
+    """
+    Encode `outputs` as "{output_path}/fps{fps}_{name}.mp4", where name is the stem of `input_path`.
+
+    The `output_path` folder is created if needed, and the writer is released even if a frame fails.
+    """
+
+    os.makedirs(output_path, exist_ok=True)
+
+    # trailing dims (from index 2) reversed — presumably (H, W) -> the (W, H) that VideoWriter expects
+    size = outputs[0].shape[2:][::-1]
+    file_name = osp.splitext(osp.basename(input_path))[0]
+
+    save_video_path = f"{output_path}/fps{fps}_{file_name}.mp4"
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    writer = cv2.VideoWriter(save_video_path, fourcc, fps, size)
+    try:
+        for imgt_pred in outputs:
+            frame = cv2.cvtColor(tensor2img(imgt_pred), cv2.COLOR_RGB2BGR)
+            writer.write(frame)
+    finally:
+        # finalize the file even when a frame cannot be converted or written
+        writer.release()
+    print(f"Demo video is saved to [{save_video_path}]")
+
+
+def process(model, image_path, output_path, fps, iters):
+    """
+    Run the whole pipeline for one video: load its frames, interpolate them and write the result.
+
+    `image_path` is the path of the input video (despite its name); `fps` is the frame rate of the
+    written video and `iters` the number of interpolation passes.
+    """
+    frames, scale, padder = get_input_video_from_path(image_path)
+    frames = interpolater(model, frames, scale, padder, iters)
+    write(frames, image_path, output_path, fps)
+
+
+def parse_args():
+    """Parse the CLI options; args.fps becomes the output fps and args.ckpt the result of download_model."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", help="Input video, or a folder of videos.")
+    parser.add_argument("--ckpt", type=str, default="./pretrained_models/amt-g.pth", help="The pretrained model.")
+    parser.add_argument(
+        "--niters",
+        type=int,
+        default=1,
+        help="Iter of Interpolation. The number of frames will be double after per iter.",
+    )
+    parser.add_argument("--output_path", type=str, default="samples", help="Output path.")
+    parser.add_argument("--fps", type=int, default=8, help="Frame rate of the input video.")
+    parser.add_argument("--folder", action="store_true", help="If the input is a folder, set this flag.")
+    args = parser.parse_args()
+
+    old_fps, args.fps = args.fps, args.fps * 2**args.niters  # each iteration doubles the frame rate
+    print(f"Interpolation will turn {old_fps}fps video to {args.fps}fps video.")
+    args.input = os.path.expanduser(args.input)
+    args.ckpt = os.path.expanduser(args.ckpt)
+    # honor an explicit --folder; otherwise any path without a video extension is treated as a folder
+    args.folder = args.folder or osp.splitext(args.input)[-1].lower() not in VID_EXT
+    args.ckpt = download_model(local_path=args.ckpt, url=hf_endpoint + "/lalala125/AMT/resolve/main/amt-g.pth")
+    return args
+
+
+if __name__ == "__main__":  # CLI entry point: interpolate one video, or every video found in a folder
+    args = parse_args()
+    ckpt_path = args.ckpt  # whatever download_model returned in parse_args (despite the "_path" name)
+    input_path = args.input
+    output_path = args.output_path
+    iters = int(args.niters)
+    fps = int(args.fps)  # already multiplied by 2**niters in parse_args
+
+    model = load_model(ckpt_path)
+
+    if args.folder:
+        for file in os.listdir(input_path):  # direct children only
+            if osp.splitext(file)[-1].lower() in VID_EXT:  # files without a video extension are skipped
+                vid_path = os.path.join(input_path, file)
+                process(model, vid_path, output_path, fps, iters)
+    else:
+        process(model, input_path, output_path, fps, iters)
+
+    print("Interpolation is done.")
+    print(f"Output path: {output_path}")
diff --git a/tools/frame_interpolation/networks/__init__.py b/tools/frame_interpolation/networks/__init__.py
new file mode 100644
index 0000000..4db0516
--- /dev/null
+++ b/tools/frame_interpolation/networks/__init__.py
@@ -0,0 +1 @@
+from .amt_g import Model
diff --git a/tools/frame_interpolation/networks/amt_g.py b/tools/frame_interpolation/networks/amt_g.py
new file mode 100644
index 0000000..84b28cb
--- /dev/null
+++ b/tools/frame_interpolation/networks/amt_g.py
@@ -0,0 +1,156 @@
+import torch
+import torch.nn as nn
+
+from .blocks.feat_enc import LargeEncoder
+from .blocks.ifrnet import Encoder, InitDecoder, IntermediateDecoder, resize
+from .blocks.multi_flow import MultiFlowDecoder, multi_flow_combine
+from .blocks.raft import BasicUpdateBlock, BidirCorrBlock, coords_grid
+
+
+class Model(nn.Module):  # frame interpolation network; presumably the AMT-G variant, per the module name
+    def __init__(self, corr_radius=3, corr_lvls=4, num_flows=5, channels=[84, 96, 112, 128], skip_channels=84):
+        super(Model, self).__init__()
+        self.radius = corr_radius
+        self.corr_levels = corr_lvls
+        self.num_flows = num_flows
+
+        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn="instance", dropout=0.0)
+        self.encoder = Encoder(channels, large=True)
+        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
+        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
+        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
+        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)
+        # NOTE(review): cdim below is hard-coded to the default `channels` (112, 96, 84 = channels[2], [1], [0])
+        self.update4 = self._get_updateblock(112, None)
+        self.update3_low = self._get_updateblock(96, 2.0)
+        self.update2_low = self._get_updateblock(84, 4.0)
+
+        self.update3_high = self._get_updateblock(96, None)
+        self.update2_high = self._get_updateblock(84, None)
+
+        self.comb_block = nn.Sequential(
+            nn.Conv2d(3 * self.num_flows, 6 * self.num_flows, 7, 1, 3),
+            nn.PReLU(6 * self.num_flows),
+            nn.Conv2d(6 * self.num_flows, 3, 7, 1, 3),
+        )
+
+    def _get_updateblock(self, cdim, scale_factor=None):  # update blocks differ only in cdim and scale_factor
+        return BasicUpdateBlock(
+            cdim=cdim,
+            hidden_dim=192,
+            flow_dim=64,
+            corr_dim=256,
+            corr_dim2=192,
+            fc_dim=188,
+            scale_factor=scale_factor,
+            corr_levels=self.corr_levels,
+            radius=self.radius,
+        )
+
+    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
+        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
+        # based on linear assumption
+        t1_scale = 1.0 / embt
+        t0_scale = 1.0 / (1.0 - embt)
+        if downsample != 1:  # resize the flows by 1/downsample and scale their values by the same factor
+            inv = 1 / downsample
+            flow0 = inv * resize(flow0, scale_factor=inv)
+            flow1 = inv * resize(flow1, scale_factor=inv)
+
+        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
+        corr = torch.cat([corr0, corr1], dim=1)
+        flow = torch.cat([flow0, flow1], dim=1)
+        return corr, flow
+
+    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
+        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
+        img0 = img0 - mean_
+        img1 = img1 - mean_
+        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0  # working-resolution copy
+        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
+        b, _, h, w = img0_.shape
+        coord = coords_grid(b, h // 8, w // 8, img0.device)  # grid at 1/8 of the working resolution
+
+        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
+        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)
+
+        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
+        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
+        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
+        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)
+
+        ######################################### the 4th decoder #########################################
+        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
+        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord, up_flow0_4, up_flow1_4, embt, downsample=1)
+
+        # residue update with lookup corr
+        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
+        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
+        up_flow0_4 = up_flow0_4 + delta_flow0_4
+        up_flow1_4 = up_flow1_4 + delta_flow1_4
+        ft_3_ = ft_3_ + delta_ft_3_
+
+        ######################################### the 3rd decoder #########################################
+        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
+        corr_3, flow_3 = self._corr_scale_lookup(corr_fn, coord, up_flow0_3, up_flow1_3, embt, downsample=2)
+
+        # residue update with lookup corr
+        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
+        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
+        up_flow0_3 = up_flow0_3 + delta_flow0_3
+        up_flow1_3 = up_flow1_3 + delta_flow1_3
+        ft_2_ = ft_2_ + delta_ft_2_
+
+        # residue update with lookup corr (hr)
+        corr_3 = resize(corr_3, scale_factor=2.0)
+        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
+        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
+        ft_2_ += delta_ft_2_
+        up_flow0_3 += delta_up_flow_3[:, 0:2]
+        up_flow1_3 += delta_up_flow_3[:, 2:4]
+
+        ######################################### the 2nd decoder #########################################
+        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
+        corr_2, flow_2 = self._corr_scale_lookup(corr_fn, coord, up_flow0_2, up_flow1_2, embt, downsample=4)
+
+        # residue update with lookup corr
+        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
+        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
+        up_flow0_2 = up_flow0_2 + delta_flow0_2
+        up_flow1_2 = up_flow1_2 + delta_flow1_2
+        ft_1_ = ft_1_ + delta_ft_1_
+
+        # residue update with lookup corr (hr)
+        corr_2 = resize(corr_2, scale_factor=4.0)
+        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
+        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
+        ft_1_ += delta_ft_1_
+        up_flow0_2 += delta_up_flow_2[:, 0:2]
+        up_flow1_2 += delta_up_flow_2[:, 2:4]
+
+        ######################################### the 1st decoder #########################################
+        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)
+
+        if scale_factor != 1.0:  # undo the working-resolution resize on the final predictions
+            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
+            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (1.0 / scale_factor)
+            mask = resize(mask, scale_factor=(1.0 / scale_factor))
+            img_res = resize(img_res, scale_factor=(1.0 / scale_factor))
+
+        # Merge multiple predictions
+        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1, mask, img_res, mean_)
+        imgt_pred = torch.clamp(imgt_pred, 0, 1)
+
+        if eval:  # only the interpolated frame is returned
+            return {
+                "imgt_pred": imgt_pred,
+            }
+        else:  # NOTE(review): h, w are taken from img0_ (after scale_factor) — confirm for scale_factor != 1.0
+            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
+            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
+            return {
+                "imgt_pred": imgt_pred,
+                "flow0_pred": [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
+                "flow1_pred": [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
+                "ft_pred": [ft_1_, ft_2_, ft_3_],
+            }
diff --git a/tools/frame_interpolation/networks/blocks/__init__.py b/tools/frame_interpolation/networks/blocks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/frame_interpolation/networks/blocks/feat_enc.py b/tools/frame_interpolation/networks/blocks/feat_enc.py
new file mode 100644
index 0000000..4798338
--- /dev/null
+++ b/tools/frame_interpolation/networks/blocks/feat_enc.py
@@ -0,0 +1,335 @@
+import torch
+import torch.nn as nn
+
+
+class BottleneckBlock(nn.Module):  # residual block: 1x1 -> 3x3 -> 1x1 convs plus an optional strided 1x1 shortcut
+    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
+        super(BottleneckBlock, self).__init__()
+        # conv1/conv2 work at planes // 4 channels; conv3 expands back to `planes`. Only conv2 is strided.
+        self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
+        self.conv2 = nn.Conv2d(planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride)
+        self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
+        self.relu = nn.ReLU(inplace=True)
+
+        num_groups = planes // 8  # only consumed by the "group" branch below
+
+        if norm_fn == "group":
+            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes // 4)
+            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes // 4)
+            self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+            if not stride == 1:
+                self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)  # norm for the shortcut path
+
+        elif norm_fn == "batch":
+            self.norm1 = nn.BatchNorm2d(planes // 4)
+            self.norm2 = nn.BatchNorm2d(planes // 4)
+            self.norm3 = nn.BatchNorm2d(planes)
+            if not stride == 1:
+                self.norm4 = nn.BatchNorm2d(planes)
+
+        elif norm_fn == "instance":
+            self.norm1 = nn.InstanceNorm2d(planes // 4)
+            self.norm2 = nn.InstanceNorm2d(planes // 4)
+            self.norm3 = nn.InstanceNorm2d(planes)
+            if not stride == 1:
+                self.norm4 = nn.InstanceNorm2d(planes)
+
+        elif norm_fn == "none":
+            self.norm1 = nn.Sequential()  # empty Sequential acts as a no-op
+            self.norm2 = nn.Sequential()
+            self.norm3 = nn.Sequential()
+            if not stride == 1:
+                self.norm4 = nn.Sequential()
+        # NOTE: any other norm_fn value assigns no norm layers, so later attribute access would fail.
+        if stride == 1:
+            self.downsample = None  # forward() then adds the input unchanged
+
+        else:
+            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)
+
+    def forward(self, x):  # returns relu(shortcut(x) + branch(x))
+        y = x
+        y = self.relu(self.norm1(self.conv1(y)))
+        y = self.relu(self.norm2(self.conv2(y)))
+        y = self.relu(self.norm3(self.conv3(y)))
+
+        if self.downsample is not None:
+            x = self.downsample(x)  # strided 1x1 conv + norm4 on the shortcut
+
+        return self.relu(x + y)
+
+
+class ResidualBlock(nn.Module):  # residual block: two 3x3 convs plus an optional strided 1x1 shortcut
+    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
+        super(ResidualBlock, self).__init__()
+        # Only conv1 is strided; conv2 keeps the resolution.
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+
+        num_groups = planes // 8  # only consumed by the "group" branch below
+
+        if norm_fn == "group":
+            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
+            if not stride == 1:
+                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)  # norm for the shortcut path
+
+        elif norm_fn == "batch":
+            self.norm1 = nn.BatchNorm2d(planes)
+            self.norm2 = nn.BatchNorm2d(planes)
+            if not stride == 1:
+                self.norm3 = nn.BatchNorm2d(planes)
+
+        elif norm_fn == "instance":
+            self.norm1 = nn.InstanceNorm2d(planes)
+            self.norm2 = nn.InstanceNorm2d(planes)
+            if not stride == 1:
+                self.norm3 = nn.InstanceNorm2d(planes)
+
+        elif norm_fn == "none":
+            self.norm1 = nn.Sequential()  # empty Sequential acts as a no-op
+            self.norm2 = nn.Sequential()
+            if not stride == 1:
+                self.norm3 = nn.Sequential()
+        # NOTE: any other norm_fn value assigns no norm layers, so later attribute access would fail.
+        if stride == 1:
+            self.downsample = None  # forward() then adds the input unchanged
+
+        else:
+            self.downsample = nn.Sequential(nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)
+
+    def forward(self, x):  # returns relu(shortcut(x) + branch(x))
+        y = x
+        y = self.relu(self.norm1(self.conv1(y)))
+        y = self.relu(self.norm2(self.conv2(y)))
+
+        if self.downsample is not None:
+            x = self.downsample(x)  # strided 1x1 conv + norm3 on the shortcut
+
+        return self.relu(x + y)
+
+
+class SmallEncoder(nn.Module):  # 3-channel image -> output_dim feature map built from BottleneckBlock stages
+    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
+        super(SmallEncoder, self).__init__()
+        self.norm_fn = norm_fn  # also forwarded to every BottleneckBlock in _make_layer
+
+        if self.norm_fn == "group":
+            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)
+
+        elif self.norm_fn == "batch":
+            self.norm1 = nn.BatchNorm2d(32)
+
+        elif self.norm_fn == "instance":
+            self.norm1 = nn.InstanceNorm2d(32)
+
+        elif self.norm_fn == "none":
+            self.norm1 = nn.Sequential()
+        # Stem: stride-2 7x7 conv; layer2 and layer3 are also built with stride 2.
+        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.in_planes = 32  # running input width, updated by each _make_layer call
+        self.layer1 = self._make_layer(32, stride=1)
+        self.layer2 = self._make_layer(64, stride=2)
+        self.layer3 = self._make_layer(96, stride=2)
+
+        self.dropout = None
+        if dropout > 0:
+            self.dropout = nn.Dropout2d(p=dropout)
+
+        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)  # 1x1 projection to output_dim
+        # Kaiming init for every conv; norm layers that carry affine params start at weight=1, bias=0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, dim, stride=1):  # two BottleneckBlocks; only the first one may be strided
+        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
+        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
+        layers = (layer1, layer2)
+
+        self.in_planes = dim  # the next stage consumes this stage's output width
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        # if input is list, combine batch dimension
+        is_list = isinstance(x, tuple) or isinstance(x, list)
+        if is_list:
+            batch_dim = x[0].shape[0]
+            x = torch.cat(x, dim=0)
+
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu1(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.conv2(x)
+
+        if self.training and self.dropout is not None:  # dropout is skipped outside training mode
+            x = self.dropout(x)
+
+        if is_list:
+            x = torch.split(x, [batch_dim, batch_dim], dim=0)  # expects exactly two inputs of equal batch size
+
+        return x
+
+
+class BasicEncoder(nn.Module):  # 3-channel image -> output_dim feature map built from ResidualBlock stages
+    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
+        super(BasicEncoder, self).__init__()
+        self.norm_fn = norm_fn  # also forwarded to every ResidualBlock in _make_layer
+
+        if self.norm_fn == "group":
+            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
+
+        elif self.norm_fn == "batch":
+            self.norm1 = nn.BatchNorm2d(64)
+
+        elif self.norm_fn == "instance":
+            self.norm1 = nn.InstanceNorm2d(64)
+
+        elif self.norm_fn == "none":
+            self.norm1 = nn.Sequential()
+        # Stem: stride-2 7x7 conv; layer2 and layer3 are also built with stride 2.
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.in_planes = 64  # running input width, updated by each _make_layer call
+        self.layer1 = self._make_layer(64, stride=1)
+        self.layer2 = self._make_layer(72, stride=2)
+        self.layer3 = self._make_layer(128, stride=2)
+
+        # output convolution
+        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)
+
+        self.dropout = None
+        if dropout > 0:
+            self.dropout = nn.Dropout2d(p=dropout)
+        # Kaiming init for every conv; norm layers that carry affine params start at weight=1, bias=0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, dim, stride=1):  # two ResidualBlocks; only the first one may be strided
+        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
+        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
+        layers = (layer1, layer2)
+
+        self.in_planes = dim  # the next stage consumes this stage's output width
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        # if input is list, combine batch dimension
+        is_list = isinstance(x, tuple) or isinstance(x, list)
+        if is_list:
+            batch_dim = x[0].shape[0]
+            x = torch.cat(x, dim=0)
+
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu1(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = self.conv2(x)
+
+        if self.training and self.dropout is not None:  # dropout is skipped outside training mode
+            x = self.dropout(x)
+
+        if is_list:
+            x = torch.split(x, [batch_dim, batch_dim], dim=0)  # expects exactly two inputs of equal batch size
+
+        return x
+
+
+class LargeEncoder(nn.Module):  # like BasicEncoder but wider, with an extra stride-1 stage (layer3_2)
+    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
+        super(LargeEncoder, self).__init__()
+        self.norm_fn = norm_fn  # also forwarded to every ResidualBlock in _make_layer
+
+        if self.norm_fn == "group":
+            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
+
+        elif self.norm_fn == "batch":
+            self.norm1 = nn.BatchNorm2d(64)
+
+        elif self.norm_fn == "instance":
+            self.norm1 = nn.InstanceNorm2d(64)
+
+        elif self.norm_fn == "none":
+            self.norm1 = nn.Sequential()
+        # Stem: stride-2 7x7 conv; layer2 and layer3 are also built with stride 2.
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.in_planes = 64  # running input width, updated by each _make_layer call
+        self.layer1 = self._make_layer(64, stride=1)
+        self.layer2 = self._make_layer(112, stride=2)
+        self.layer3 = self._make_layer(160, stride=2)
+        self.layer3_2 = self._make_layer(160, stride=1)
+
+        # output convolution
+        self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)  # in_planes is 160 at this point
+
+        self.dropout = None
+        if dropout > 0:
+            self.dropout = nn.Dropout2d(p=dropout)
+        # Kaiming init for every conv; norm layers that carry affine params start at weight=1, bias=0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, dim, stride=1):  # two ResidualBlocks; only the first one may be strided
+        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
+        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
+        layers = (layer1, layer2)
+
+        self.in_planes = dim  # the next stage consumes this stage's output width
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        # if input is list, combine batch dimension
+        is_list = isinstance(x, tuple) or isinstance(x, list)
+        if is_list:
+            batch_dim = x[0].shape[0]
+            x = torch.cat(x, dim=0)
+
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu1(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer3_2(x)
+
+        x = self.conv2(x)
+
+        if self.training and self.dropout is not None:  # dropout is skipped outside training mode
+            x = self.dropout(x)
+
+        if is_list:
+            x = torch.split(x, [batch_dim, batch_dim], dim=0)  # expects exactly two inputs of equal batch size
+
+        return x
diff --git a/tools/frame_interpolation/networks/blocks/ifrnet.py b/tools/frame_interpolation/networks/blocks/ifrnet.py
new file mode 100644
index 0000000..5719a04
--- /dev/null
+++ b/tools/frame_interpolation/networks/blocks/ifrnet.py
@@ -0,0 +1,115 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from tools.frame_interpolation.utils.flow_utils import warp
+
+
+def resize(x, scale_factor):  # bilinear rescale of x by scale_factor; values themselves are not rescaled
+    return F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
+
+
+def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
+    return nn.Sequential(  # Conv2d followed by a PReLU with one learnable slope per output channel
+        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=bias),
+        nn.PReLU(out_channels),
+    )
+
+
+class ResBlock(nn.Module):  # residual block whose last `side_channels` channels get extra 3x3 convs
+    def __init__(self, in_channels, side_channels, bias=True):
+        super(ResBlock, self).__init__()
+        self.side_channels = side_channels  # width of the trailing channel slice refined by conv2/conv4
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias), nn.PReLU(in_channels)
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(side_channels, side_channels, kernel_size=3, stride=1, padding=1, bias=bias),
+            nn.PReLU(side_channels),
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias), nn.PReLU(in_channels)
+        )
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(side_channels, side_channels, kernel_size=3, stride=1, padding=1, bias=bias),
+            nn.PReLU(side_channels),
+        )
+        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
+        self.prelu = nn.PReLU(in_channels)  # applied after the residual sum in forward()
+
+    def forward(self, x):
+        out = self.conv1(x)
+        # Split off the last side_channels channels, refine only those, then re-join.
+        res_feat = out[:, : -self.side_channels, ...]
+        side_feat = out[:, -self.side_channels :, :, :]
+        side_feat = self.conv2(side_feat)
+        out = self.conv3(torch.cat([res_feat, side_feat], 1))
+        # Same split / refine / re-join pattern a second time, using conv4 and conv5.
+        res_feat = out[:, : -self.side_channels, ...]
+        side_feat = out[:, -self.side_channels :, :, :]
+        side_feat = self.conv4(side_feat)
+        out = self.conv5(torch.cat([res_feat, side_feat], 1))
+
+        out = self.prelu(x + out)  # residual connection
+        return out
+
+
+class Encoder(nn.Module):  # feature pyramid: one stride-2 stage per entry of `channels`
+    def __init__(self, channels, large=False):
+        super(Encoder, self).__init__()
+        self.channels = channels
+        prev_ch = 3  # the first stage consumes a 3-channel input
+        for idx, ch in enumerate(channels, 1):
+            k = 7 if large and idx == 1 else 3  # 7x7 kernel only for the first stage when large=True
+            p = 3 if k == 7 else 1  # padding of k // 2 for both kernel sizes
+            self.register_module(
+                f"pyramid{idx}", nn.Sequential(convrelu(prev_ch, ch, k, 2, p), convrelu(ch, ch, 3, 1, 1))
+            )
+            prev_ch = ch
+
+    def forward(self, in_x):  # returns the list of per-stage outputs, first stage first
+        fs = []
+        for idx in range(len(self.channels)):
+            out_x = getattr(self, f"pyramid{idx+1}")(in_x)  # stages are registered 1-indexed
+            fs.append(out_x)
+            in_x = out_x  # each stage feeds the next
+        return fs
+
+
+class InitDecoder(nn.Module):  # first decoder stage: predicts two flows and a feature map from (f0, f1, embt)
+    def __init__(self, in_ch, out_ch, skip_ch) -> None:
+        super().__init__()
+        self.convblock = nn.Sequential(
+            convrelu(in_ch * 2 + 1, in_ch * 2),  # input is f0, f1 and one extra channel (embt)
+            ResBlock(in_ch * 2, skip_ch),
+            nn.ConvTranspose2d(in_ch * 2, out_ch + 4, 4, 2, 1, bias=True),  # stride-2 transposed conv
+        )
+
+    def forward(self, f0, f1, embt):
+        h, w = f0.shape[2:]
+        embt = embt.repeat(1, 1, h, w)  # presumably embt is (B, 1, 1, 1), tiled to (B, 1, h, w) - TODO confirm
+        out = self.convblock(torch.cat([f0, f1, embt], 1))
+        flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)  # first 4 channels: two 2-channel flows
+        ft_ = out[:, 4:, ...]  # remaining out_ch channels: feature map
+        return flow0, flow1, ft_
+
+
+class IntermediateDecoder(nn.Module):  # refines two flows and a feature map using flow-warped f0 / f1
+    def __init__(self, in_ch, out_ch, skip_ch) -> None:
+        super().__init__()
+        self.convblock = nn.Sequential(
+            convrelu(in_ch * 3 + 4, in_ch * 3),  # input is ft_, warped f0, warped f1 and two 2-channel flows
+            ResBlock(in_ch * 3, skip_ch),
+            nn.ConvTranspose2d(in_ch * 3, out_ch + 4, 4, 2, 1, bias=True),  # stride-2 transposed conv
+        )
+
+    def forward(self, ft_, f0, f1, flow0_in, flow1_in):
+        f0_warp = warp(f0, flow0_in)
+        f1_warp = warp(f1, flow1_in)
+        f_in = torch.cat([ft_, f0_warp, f1_warp, flow0_in, flow1_in], 1)
+        out = self.convblock(f_in)
+        flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)  # first 4 channels: two 2-channel flow residuals
+        ft_ = out[:, 4:, ...]  # remaining channels: updated feature map
+        flow0 = flow0 + 2.0 * resize(flow0_in, scale_factor=2.0)  # residual + input flow upsampled 2x and doubled
+        flow1 = flow1 + 2.0 * resize(flow1_in, scale_factor=2.0)
+        return flow0, flow1, ft_
diff --git a/tools/frame_interpolation/networks/blocks/multi_flow.py b/tools/frame_interpolation/networks/blocks/multi_flow.py
new file mode 100644
index 0000000..cbb96a9
--- /dev/null
+++ b/tools/frame_interpolation/networks/blocks/multi_flow.py
@@ -0,0 +1,62 @@
+import torch
+import torch.nn as nn
+
+from tools.frame_interpolation.utils.flow_utils import warp
+
+from .ifrnet import ResBlock, convrelu, resize
+
+
+def multi_flow_combine(comb_block, img0, img1, flow0, flow1, mask=None, img_res=None, mean=None):
+    """
+    A parallel implementation of multiple flow field warping
+    comb_block: An nn.Sequential object.
+    img shape: [b, c, h, w]
+    flow shape: [b, 2*num_flows, h, w]
+    mask (opt):
+        If 'mask' is None, the function conducts a simple average.
+    img_res (opt):
+        If 'img_res' is None, the function adds zero instead.
+    mean (opt):
+        If 'mean' is None, the function adds zero instead.
+    """
+    b, c, h, w = flow0.shape
+    num_flows = c // 2
+    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
+    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
+    # A missing mask becomes 0.5 so the blend below is the documented simple average (None would raise).
+    mask = mask.reshape(b, num_flows, 1, h, w).reshape(-1, 1, h, w) if mask is not None else 0.5
+    img_res = img_res.reshape(b, num_flows, 3, h, w).reshape(-1, 3, h, w) if img_res is not None else 0
+    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
+    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
+    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1) if mean is not None else 0
+    # Warp every (image, flow) pair in one batch of size b * num_flows, then blend per flow.
+    img0_warp = warp(img0, flow0)
+    img1_warp = warp(img1, flow1)
+    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
+    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
+    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
+    return imgt_pred
+
+
+class MultiFlowDecoder(nn.Module):  # last decoder stage: predicts num_flows flow pairs, masks and image residuals
+    def __init__(self, in_ch, skip_ch, num_flows=3):
+        super(MultiFlowDecoder, self).__init__()
+        self.num_flows = num_flows
+        self.convblock = nn.Sequential(
+            convrelu(in_ch * 3 + 4, in_ch * 3),
+            ResBlock(in_ch * 3, skip_ch),
+            nn.ConvTranspose2d(in_ch * 3, 8 * num_flows, 4, 2, 1, bias=True),  # 8 = 2 + 2 + 1 + 3 channels per flow
+        )
+
+    def forward(self, ft_, f0, f1, flow0, flow1):
+        n = self.num_flows
+        f0_warp = warp(f0, flow0)
+        f1_warp = warp(f1, flow1)
+        out = self.convblock(torch.cat([ft_, f0_warp, f1_warp, flow0, flow1], 1))
+        delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2 * n, 2 * n, n, 3 * n], 1)
+        mask = torch.sigmoid(mask)  # squash mask values into (0, 1)
+        # The input flow is upsampled 2x, doubled, and tiled across the n predicted flow residuals.
+        flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
+        flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
+
+        return flow0, flow1, mask, img_res
diff --git a/tools/frame_interpolation/networks/blocks/raft.py b/tools/frame_interpolation/networks/blocks/raft.py
new file mode 100644
index 0000000..1576889
--- /dev/null
+++ b/tools/frame_interpolation/networks/blocks/raft.py
@@ -0,0 +1,213 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def resize(x, scale_factor):  # bilinear rescale of x by scale_factor; values themselves are not rescaled
+    return F.interpolate(x, scale_factor=scale_factor, mode="bilinear", align_corners=False)
+
+
+def bilinear_sampler(img, coords, mask=False):
+    """Wrapper for grid_sample, uses pixel coordinates"""
+    H, W = img.shape[-2:]
+    xgrid, ygrid = coords.split([1, 1], dim=-1)  # last axis of coords holds (x, y)
+    xgrid = 2 * xgrid / (W - 1) - 1  # pixel index -> [-1, 1], matching align_corners=True below
+    ygrid = 2 * ygrid / (H - 1) - 1
+
+    grid = torch.cat([xgrid, ygrid], dim=-1)
+    img = F.grid_sample(img, grid, align_corners=True)
+
+    if mask:
+        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)  # strictly inside the normalised range
+        return img, mask.float()
+
+    return img
+
+
+def coords_grid(batch, ht, wd, device):  # returns a float (batch, 2, ht, wd) grid of pixel coordinates
+    coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device), indexing="ij")
+    coords = torch.stack(coords[::-1], dim=0).float()  # [::-1] reorders (row, col) to (x, y)
+    return coords[None].repeat(batch, 1, 1, 1)
+
+
+class SmallUpdateBlock(nn.Module):  # predicts a feature delta and a 4-channel flow delta from (net, flow, corr)
+    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim, corr_levels=4, radius=3, scale_factor=None):
+        super(SmallUpdateBlock, self).__init__()
+        cor_planes = corr_levels * (2 * radius + 1) ** 2  # one (2r+1)^2 window per correlation level
+        self.scale_factor = scale_factor  # None means the block runs at the input resolution
+        # convc1 expects 2 * cor_planes channels - presumably both correlation directions concatenated.
+        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
+        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
+        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
+        self.conv = nn.Conv2d(corr_dim + flow_dim, fc_dim, 3, padding=1)
+        # Despite the name, `gru` is a plain conv -> LeakyReLU -> conv stack.
+        self.gru = nn.Sequential(
+            nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+        )
+
+        self.feat_head = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
+        )
+
+        self.flow_head = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, 4, 3, padding=1),
+        )
+
+        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
+
+    def forward(self, net, flow, corr):
+        net = resize(net, 1 / self.scale_factor) if self.scale_factor is not None else net  # shrink net first
+        cor = self.lrelu(self.convc1(corr))
+        flo = self.lrelu(self.convf1(flow))
+        flo = self.lrelu(self.convf2(flo))
+        cor_flo = torch.cat([cor, flo], dim=1)
+        inp = self.lrelu(self.conv(cor_flo))
+        inp = torch.cat([inp, flow, net], dim=1)  # fused features + raw flow + context features
+
+        out = self.gru(inp)
+        delta_net = self.feat_head(out)
+        delta_flow = self.flow_head(out)
+
+        if self.scale_factor is not None:
+            delta_net = resize(delta_net, scale_factor=self.scale_factor)
+            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)  # rescale magnitudes too
+
+        return delta_net, delta_flow
+
+
+class BasicUpdateBlock(nn.Module):  # like SmallUpdateBlock, plus a second corr conv and 4 * out_num flow channels
+    def __init__(
+        self,
+        cdim,
+        hidden_dim,
+        flow_dim,
+        corr_dim,
+        corr_dim2,
+        fc_dim,
+        corr_levels=4,
+        radius=3,
+        scale_factor=None,
+        out_num=1,
+    ):
+        super(BasicUpdateBlock, self).__init__()
+        cor_planes = corr_levels * (2 * radius + 1) ** 2  # one (2r+1)^2 window per correlation level
+
+        self.scale_factor = scale_factor  # None means the block runs at the input resolution
+        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
+        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
+        self.convf1 = nn.Conv2d(4, flow_dim * 2, 7, padding=3)
+        self.convf2 = nn.Conv2d(flow_dim * 2, flow_dim, 3, padding=1)
+        self.conv = nn.Conv2d(flow_dim + corr_dim2, fc_dim, 3, padding=1)
+        # Despite the name, `gru` is a plain conv -> LeakyReLU -> conv stack.
+        self.gru = nn.Sequential(
+            nn.Conv2d(fc_dim + 4 + cdim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+        )
+
+        self.feat_head = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
+        )
+
+        self.flow_head = nn.Sequential(
+            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
+            nn.LeakyReLU(negative_slope=0.1, inplace=True),
+            nn.Conv2d(hidden_dim, 4 * out_num, 3, padding=1),
+        )
+
+        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)
+
+    def forward(self, net, flow, corr):
+        net = resize(net, 1 / self.scale_factor) if self.scale_factor is not None else net  # shrink net first
+        cor = self.lrelu(self.convc1(corr))
+        cor = self.lrelu(self.convc2(cor))
+        flo = self.lrelu(self.convf1(flow))
+        flo = self.lrelu(self.convf2(flo))
+        cor_flo = torch.cat([cor, flo], dim=1)
+        inp = self.lrelu(self.conv(cor_flo))
+        inp = torch.cat([inp, flow, net], dim=1)  # fused features + raw flow + context features
+
+        out = self.gru(inp)
+        delta_net = self.feat_head(out)
+        delta_flow = self.flow_head(out)
+
+        if self.scale_factor is not None:
+            delta_net = resize(delta_net, scale_factor=self.scale_factor)
+            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)  # rescale magnitudes too
+        return delta_net, delta_flow
+
+
+class BidirCorrBlock:  # all-pairs correlation pyramid between two feature maps, sampled in both directions
+    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
+        self.num_levels = num_levels
+        self.radius = radius
+        self.corr_pyramid = []  # fmap1 -> fmap2 volumes, finest level first
+        self.corr_pyramid_T = []  # transposed (fmap2 -> fmap1) volumes, finest level first
+
+        corr = BidirCorrBlock.corr(fmap1, fmap2)
+        batch, h1, w1, dim, h2, w2 = corr.shape
+        corr_T = corr.clone().permute(0, 4, 5, 3, 1, 2)  # swap the roles of the two spatial index pairs
+        # Fold the query position into the batch axis so each query owns one (dim, h, w) map.
+        corr = corr.reshape(batch * h1 * w1, dim, h2, w2)
+        corr_T = corr_T.reshape(batch * h2 * w2, dim, h1, w1)
+
+        self.corr_pyramid.append(corr)
+        self.corr_pyramid_T.append(corr_T)
+        # Every further level halves the target resolution by 2x2 average pooling.
+        for _ in range(self.num_levels - 1):
+            corr = F.avg_pool2d(corr, 2, stride=2)
+            corr_T = F.avg_pool2d(corr_T, 2, stride=2)
+            self.corr_pyramid.append(corr)
+            self.corr_pyramid_T.append(corr_T)
+
+    def __call__(self, coords0, coords1):  # returns (out, out_T), each (batch, levels * (2r+1)^2, h1, w1) float
+        r = self.radius
+        coords0 = coords0.permute(0, 2, 3, 1)  # (batch, 2, h, w) -> (batch, h, w, 2)
+        coords1 = coords1.permute(0, 2, 3, 1)
+        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
+        batch, h1, w1, _ = coords0.shape
+
+        out_pyramid = []
+        out_pyramid_T = []
+        for i in range(self.num_levels):
+            corr = self.corr_pyramid[i]
+            corr_T = self.corr_pyramid_T[i]
+            # (2r+1) x (2r+1) window of integer offsets; it does not depend on the level index i.
+            dx = torch.linspace(-r, r, 2 * r + 1, device=coords0.device)
+            dy = torch.linspace(-r, r, 2 * r + 1, device=coords0.device)
+            delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1)  # NOTE(review): last axis is (dy, dx)
+            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
+            # Coordinates are divided by 2**i to match the pooled resolution of level i.
+            centroid_lvl_0 = coords0.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
+            centroid_lvl_1 = coords1.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
+            coords_lvl_0 = centroid_lvl_0 + delta_lvl
+            coords_lvl_1 = centroid_lvl_1 + delta_lvl
+
+            corr = bilinear_sampler(corr, coords_lvl_0)
+            corr_T = bilinear_sampler(corr_T, coords_lvl_1)
+            corr = corr.view(batch, h1, w1, -1)
+            corr_T = corr_T.view(batch, h1, w1, -1)
+            out_pyramid.append(corr)
+            out_pyramid_T.append(corr_T)
+
+        out = torch.cat(out_pyramid, dim=-1)  # concatenate all levels along the channel-last axis
+        out_T = torch.cat(out_pyramid_T, dim=-1)
+        return out.permute(0, 3, 1, 2).contiguous().float(), out_T.permute(0, 3, 1, 2).contiguous().float()
+
+    @staticmethod
+    def corr(fmap1, fmap2):  # dot product of every fmap1 position with every fmap2 position
+        batch, dim, ht, wd = fmap1.shape
+        fmap1 = fmap1.view(batch, dim, ht * wd)
+        fmap2 = fmap2.view(batch, dim, ht * wd)  # reuses fmap1's shape, so both maps must match
+
+        corr = torch.matmul(fmap1.transpose(1, 2), fmap2)
+        corr = corr.view(batch, ht, wd, 1, ht, wd)
+        return corr / torch.sqrt(torch.tensor(dim).float())  # scale by 1 / sqrt(feature dim)
diff --git a/tools/frame_interpolation/utils/__init__.py b/tools/frame_interpolation/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/frame_interpolation/utils/dist_utils.py b/tools/frame_interpolation/utils/dist_utils.py
new file mode 100644
index 0000000..d754d4f
--- /dev/null
+++ b/tools/frame_interpolation/utils/dist_utils.py
@@ -0,0 +1,48 @@
+import os
+
+import torch
+
+
+def get_world_size():
+    """Return PMI_SIZE or OMPI_COMM_WORLD_SIZE from the environment, else the CUDA device count
+    :rtype: int
+    """
+    if os.environ.get("PMI_SIZE") is not None:
+        return int(os.environ.get("PMI_SIZE") or 1)  # an empty value falls back to 1
+    elif os.environ.get("OMPI_COMM_WORLD_SIZE") is not None:
+        return int(os.environ.get("OMPI_COMM_WORLD_SIZE") or 1)
+    else:
+        return torch.cuda.device_count()  # no launcher variables set: use the number of CUDA devices
+
+
+def get_global_rank():
+    """Return PMI_RANK or OMPI_COMM_WORLD_RANK from the environment, else 0
+    :rtype: int
+    """
+    if os.environ.get("PMI_RANK") is not None:
+        return int(os.environ.get("PMI_RANK") or 0)  # an empty value falls back to 0
+    elif os.environ.get("OMPI_COMM_WORLD_RANK") is not None:
+        return int(os.environ.get("OMPI_COMM_WORLD_RANK") or 0)
+    else:
+        return 0
+
+
+def get_local_rank():
+    """Return MPI_LOCALRANKID or OMPI_COMM_WORLD_LOCAL_RANK from the environment, else 0
+    :rtype: int
+    """
+    if os.environ.get("MPI_LOCALRANKID") is not None:
+        return int(os.environ.get("MPI_LOCALRANKID") or 0)  # an empty value falls back to 0
+    elif os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") is not None:
+        return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or 0)
+    else:
+        return 0
+
+
+def get_master_ip():  # master address from AZ_BATCH_MASTER_NODE / AZ_BATCHAI_MPI_MASTER_NODE, else localhost
+    if os.environ.get("AZ_BATCH_MASTER_NODE") is not None:
+        return os.environ.get("AZ_BATCH_MASTER_NODE").split(":")[0]  # keep only the part before the first ":"
+    elif os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE") is not None:
+        return os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE")  # returned unmodified
+    else:
+        return "127.0.0.1"
diff --git a/tools/frame_interpolation/utils/flow_utils.py b/tools/frame_interpolation/utils/flow_utils.py
new file mode 100644
index 0000000..d16fe71
--- /dev/null
+++ b/tools/frame_interpolation/utils/flow_utils.py
@@ -0,0 +1,125 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def warp(img, flow):  # backward-warp img by a per-pixel flow given in pixels (channel 0 = x, channel 1 = y)
+    B, _, H, W = flow.shape
+    xx = torch.linspace(-1.0, 1.0, W).view(1, 1, 1, W).expand(B, -1, H, -1)  # normalised x coordinates
+    yy = torch.linspace(-1.0, 1.0, H).view(1, 1, H, 1).expand(B, -1, -1, W)  # normalised y coordinates
+    grid = torch.cat([xx, yy], 1).to(img)  # move the base grid to img's device and dtype
+    flow_ = torch.cat([flow[:, 0:1, :, :] / ((W - 1.0) / 2.0), flow[:, 1:2, :, :] / ((H - 1.0) / 2.0)], 1)
+    grid_ = (grid + flow_).permute(0, 2, 3, 1)  # grid_sample expects the sampling grid as (B, H, W, 2)
+    output = F.grid_sample(input=img, grid=grid_, mode="bilinear", padding_mode="border", align_corners=True)
+    return output
+
+
+def make_colorwheel():
+    """
+    Generates a color wheel for optical flow visualization as presented in:
+        Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
+        URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
+    Code follows the original C++ source code of Daniel Scharstein.
+    Code follows the Matlab source code of Deqing Sun.
+    Returns:
+        np.ndarray: Color wheel
+    """
+    # Number of entries in each of the six hue segments.
+    RY = 15
+    YG = 6
+    GC = 4
+    CB = 11
+    BM = 13
+    MR = 6
+
+    ncols = RY + YG + GC + CB + BM + MR  # 55 entries in total
+    colorwheel = np.zeros((ncols, 3))  # one row of three channel values (0-255) per entry
+    col = 0  # write offset of the current segment
+
+    # RY
+    colorwheel[0:RY, 0] = 255
+    colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY)
+    col = col + RY
+    # YG
+    colorwheel[col : col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG)
+    colorwheel[col : col + YG, 1] = 255
+    col = col + YG
+    # GC
+    colorwheel[col : col + GC, 1] = 255
+    colorwheel[col : col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC)
+    col = col + GC
+    # CB
+    colorwheel[col : col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB)
+    colorwheel[col : col + CB, 2] = 255
+    col = col + CB
+    # BM
+    colorwheel[col : col + BM, 2] = 255
+    colorwheel[col : col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM)
+    col = col + BM
+    # MR
+    colorwheel[col : col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR)
+    colorwheel[col : col + MR, 0] = 255
+    return colorwheel
+
+
+def flow_uv_to_colors(u, v, convert_to_bgr=False):
+    """
+    Applies the flow color wheel to (possibly clipped) flow components u and v.
+    According to the C++ source code of Daniel Scharstein
+    According to the Matlab source code of Deqing Sun
+    Args:
+        u (np.ndarray): Input horizontal flow of shape [H,W]
+        v (np.ndarray): Input vertical flow of shape [H,W]
+        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+    Returns:
+        np.ndarray: Flow visualization image of shape [H,W,3]
+    """
+    flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
+    colorwheel = make_colorwheel()  # shape [55x3]
+    ncols = colorwheel.shape[0]
+    rad = np.sqrt(np.square(u) + np.square(v))  # flow magnitude per pixel
+    a = np.arctan2(-v, -u) / np.pi  # flow angle scaled to [-1, 1]
+    fk = (a + 1) / 2 * (ncols - 1)  # fractional index into the color wheel
+    k0 = np.floor(fk).astype(np.int32)
+    k1 = k0 + 1
+    k1[k1 == ncols] = 0  # wrap around to the first wheel entry
+    f = fk - k0  # interpolation weight between entries k0 and k1
+    for i in range(colorwheel.shape[1]):
+        tmp = colorwheel[:, i]
+        col0 = tmp[k0] / 255.0
+        col1 = tmp[k1] / 255.0
+        col = (1 - f) * col0 + f * col1
+        idx = rad <= 1
+        col[idx] = 1 - rad[idx] * (1 - col[idx])  # blend toward white as the magnitude goes to 0
+        col[~idx] = col[~idx] * 0.75  # out of range
+        # Note the 2-i => BGR instead of RGB
+        ch_idx = 2 - i if convert_to_bgr else i
+        flow_image[:, :, ch_idx] = np.floor(255 * col)
+    return flow_image
+
+
+def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
+    """
+    Converts a two-channel flow field of shape [H,W,2] into a color image.
+    Args:
+        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
+        clip_flow (float, optional): Clip each flow component to [-clip_flow, clip_flow]. Defaults to None.
+        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+    Returns:
+        np.ndarray: Flow visualization image of shape [H,W,3]
+    """
+    assert flow_uv.ndim == 3, "input flow must have three dimensions"
+    assert flow_uv.shape[2] == 2, "input flow must have shape [H,W,2]"
+    if clip_flow is not None:
+        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow)  # symmetric: a lower bound of 0 erased negative flow
+    u = flow_uv[:, :, 0]
+    v = flow_uv[:, :, 1]
+    rad = np.sqrt(np.square(u) + np.square(v))
+    rad_max = np.max(rad)
+    epsilon = 1e-5  # avoids division by zero for an all-zero flow field
+    u = u / (rad_max + epsilon)
+    v = v / (rad_max + epsilon)
+    return flow_uv_to_colors(u, v, convert_to_bgr)
diff --git a/tools/frame_interpolation/utils/utils.py b/tools/frame_interpolation/utils/utils.py
new file mode 100644
index 0000000..285a65f
--- /dev/null
+++ b/tools/frame_interpolation/utils/utils.py
@@ -0,0 +1,314 @@
+import random
+import re
+import sys
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from imageio import imread, imwrite
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+class AverageMeter:
+    """Tracks the latest value together with a running weighted sum, count and mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = self.avg = self.sum = 0.0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count  # mean weighted by every n seen so far
+
+
+class AverageMeterGroups:
+    """Keeps one ``AverageMeter`` per metric name, created on the first update of that name."""
+
+    def __init__(self) -> None:
+        self.meter_dict = {}
+
+    def update(self, dict, n=1):
+        for key, value in dict.items():
+            if self.meter_dict.get(key) is None:
+                self.meter_dict[key] = AverageMeter()
+            self.meter_dict[key].update(value, n)
+
+    def reset(self, name=None):
+        if name is None:
+            for meter in self.meter_dict.values():
+                meter.reset()
+        elif self.meter_dict.get(name) is not None:
+            self.meter_dict[name].reset()
+
+    def avg(self, name):
+        # Unknown names yield None rather than raising.
+        meter = self.meter_dict.get(name)
+        return None if meter is None else meter.avg
+
+
+class InputPadder:
+    """Pads images such that dimensions are divisible by divisor, and crops the padding off again."""
+
+    def __init__(self, dims, divisor=16):
+        self.ht, self.wd = dims[-2:]
+        # Amount needed to reach the next multiple of divisor (0 if already aligned).
+        extra_ht, extra_wd = -self.ht % divisor, -self.wd % divisor
+        # Order is [left, right, top, bottom]; an odd amount puts the extra pixel right/bottom.
+        self._pad = [extra_wd // 2, extra_wd - extra_wd // 2, extra_ht // 2, extra_ht - extra_ht // 2]
+
+    def pad(self, *inputs):
+        """Replicate-pad every input; a single input yields a tensor, several yield a list."""
+        padded = [F.pad(x, self._pad, mode="replicate") for x in inputs]
+        return padded[0] if len(padded) == 1 else padded
+
+    def unpad(self, *inputs):
+        """Crop every input back; a single input yields a tensor, several yield a list."""
+        cropped = [self._unpad(x) for x in inputs]
+        return cropped[0] if len(cropped) == 1 else cropped
+
+    def _unpad(self, x):
+        # Slice away the stored margins along the last two dimensions.
+        left, right, top, bottom = self._pad
+        ht, wd = x.shape[-2:]
+        return x[..., top : ht - bottom, left : wd - right]
+
+
+def img2tensor(img):
+    """Turn an (H, W, C) array into a (1, C, H, W) tensor divided by 255; channels past the third are dropped."""
+    rgb = img[:, :, :3] if img.shape[-1] > 3 else img
+    return torch.tensor(rgb).permute(2, 0, 1).unsqueeze(0) / 255.0
+
+
+def tensor2img(img_t):  # scale by 255, drop dim 0, move channels last, clip to [0, 255], cast to uint8
+    return (img_t * 255.0).detach().squeeze(0).permute(1, 2, 0).cpu().numpy().clip(0, 255).astype(np.uint8)
+
+
+def seed_all(seed):
+    """Seed Python ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``."""
+    # Order: stdlib, NumPy, torch, then every CUDA device.
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
+        seed_fn(seed)
+
+
+def read(file):
+    """Load ``file`` with the reader matching its extension.
+
+    Raises:
+        Exception: if the extension is not one of the supported ones.
+    """
+    if file.endswith(".float3"):
+        return readFloat(file)
+    if file.endswith(".flo"):
+        return readFlow(file)
+    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
+        return readImage(file)
+    if file.endswith(".pfm"):
+        # readPFM returns (data, scale); only the data is handed back.
+        return readPFM(file)[0]
+
+    raise Exception("don't know how to read %s" % file)
+
+
+def write(file, data):
+    """Save ``data`` to ``file`` with the writer matching its extension.
+
+    Raises:
+        Exception: if the extension is not one of the supported ones.
+    """
+    if file.endswith(".float3"):
+        return writeFloat(file, data)
+    if file.endswith(".flo"):
+        return writeFlow(file, data)
+    # All raster formats share one writer.
+    if file.endswith((".ppm", ".pgm", ".png", ".jpg")):
+        return writeImage(file, data)
+    if file.endswith(".pfm"):
+        return writePFM(file, data)
+
+    raise Exception("don't know how to write %s" % file)
+
+
+def readPFM(file):
+    """Read a PFM image and return ``(data, scale)``.
+
+    The header is three text lines: ``PF`` (three channels) or ``Pf`` (one
+    channel), ``<width> <height>``, and a scale whose sign selects the byte
+    order (negative: little-endian). The rows are flipped vertically on load.
+
+    Returns:
+        tuple: float array of shape (H, W, 3) or (H, W), and the absolute scale.
+    Raises:
+        Exception: if the magic line or the dimension line is malformed.
+    """
+    # The context manager closes the handle on every path, including errors.
+    with open(file, "rb") as fh:
+        header = fh.readline().rstrip().decode("ascii")
+        if header == "PF":
+            color = True
+        elif header == "Pf":
+            color = False
+        else:
+            raise Exception("Not a PFM file.")
+
+        dim_match = re.match(r"^(\d+)\s(\d+)\s$", fh.readline().decode("ascii"))
+        if not dim_match:
+            raise Exception("Malformed PFM header.")
+        width, height = map(int, dim_match.groups())
+
+        scale = float(fh.readline().decode("ascii").rstrip())
+        endian = "<" if scale < 0 else ">"
+        scale = abs(scale)
+        data = np.fromfile(fh, endian + "f")
+
+    shape = (height, width, 3) if color else (height, width)
+    # Flip so that row 0 of the result is the last row stored in the file.
+    data = np.flipud(np.reshape(data, shape))
+    return data, scale
+
+
+def writePFM(file, image, scale=1):
+    """Write a float32 image to ``file`` in PFM format.
+
+    Args:
+        file (str): Destination path.
+        image (np.ndarray): float32 array shaped H x W x 3, H x W x 1 or H x W.
+        scale (float): Header scale; written negated for little-endian data.
+    Raises:
+        Exception: if the dtype is not float32 or the shape is unsupported.
+    """
+    if image.dtype.name != "float32":
+        raise Exception("Image dtype must be float32.")
+    if len(image.shape) == 3 and image.shape[2] == 3:
+        color = True
+    elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1:
+        color = False
+    else:
+        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+    image = np.flipud(image)
+    endian = image.dtype.byteorder
+    if endian == "<" or endian == "=" and sys.byteorder == "little":
+        scale = -scale
+    with open(file, "wb") as fh:  # opened only after validation; always closed
+        # Encode whichever magic was chosen; encoding just the "Pf" operand left "PF" a str.
+        fh.write(("PF\n" if color else "Pf\n").encode())
+        fh.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
+        fh.write("%f\n".encode() % scale)
+        image.tofile(fh)
+
+
+def readFlow(name):
+    """Read optical flow from a ``.flo`` file, or the first two channels of a PFM file."""
+    if name.endswith(".pfm") or name.endswith(".PFM"):
+        return readPFM(name)[0][:, :, 0:2]
+
+    # Layout read below: 4-byte tag "PIEH", int32 width, int32 height, then float32 values.
+    with open(name, "rb") as f:  # closed even when the header check fails
+        # Compare raw bytes so a non-text header reports the error below instead of a decode error.
+        if f.read(4) != b"PIEH":
+            raise Exception("Flow file header does not contain PIEH")
+
+        width = np.fromfile(f, np.int32, 1).squeeze()
+        height = np.fromfile(f, np.int32, 1).squeeze()
+        flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))
+
+    return flow.astype(np.float32)
+
+
+def readImage(name):
+    """Load an image: PFM names go through ``readPFM``, everything else through ``imread``."""
+    if not (name.endswith(".pfm") or name.endswith(".PFM")):
+        return imread(name)
+
+    data = readPFM(name)[0]
+    # Keep at most the first three channels of a 3-D array; 2-D data is returned unchanged.
+    return data[:, :, 0:3] if len(data.shape) == 3 else data
+
+
+def writeImage(name, data):
+    """Save ``data`` via ``writePFM`` (scale 1) for PFM names, otherwise via ``imwrite``."""
+    is_pfm = name.endswith(".pfm") or name.endswith(".PFM")
+    return writePFM(name, data, 1) if is_pfm else imwrite(name, data)
+
+
+def writeFlow(name, flow):
+    """Write ``flow`` to ``name``: the tag "PIEH", int32 ``shape[1]`` and ``shape[0]``, then the data as float32."""
+    with open(name, "wb") as f:  # guarantees the data is flushed and the handle closed
+        f.write("PIEH".encode("utf-8"))
+        np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
+        flow.astype(np.float32).tofile(f)
+
+
+def readFloat(name):
+    """Read an array from a ``.float3`` file.
+
+    The header is the line ``float``, the number of dimensions, then one size
+    per line; float32 data follows. The sizes are reversed before reshaping.
+
+    Raises:
+        Exception: if the first line is not ``float``.
+    """
+    with open(name, "rb") as f:  # guarantees the handle is released
+        if f.readline().decode("utf-8") != "float\n":
+            raise Exception("float file %s did not contain <float> keyword" % name)
+        dim = int(f.readline())
+        dims = [int(f.readline()) for _ in range(dim)]
+        count = 1
+        for d in dims:
+            count *= d
+        data = np.fromfile(f, np.float32, count).reshape(dims[::-1])
+
+    if dim > 2:
+        # Same axis order as transposing by (2, 1, 0) and then by (1, 0, 2).
+        data = np.transpose(data, (1, 2, 0))
+    return data
+
+
+def writeFloat(name, data):
+    """Write an array of up to three dimensions to a ``.float3`` file.
+
+    The header is ``float``, the number of dimensions, then the sizes with the
+    first two axes swapped; 3-D data is stored with its last axis first, as float32.
+
+    Raises:
+        Exception: if ``data`` has more than three dimensions.
+    """
+    dim = len(data.shape)
+    if dim > 3:
+        raise Exception("bad float file dimension: %d" % dim)
+    if dim == 1:
+        sizes = [data.shape[0]]
+    else:
+        sizes = [data.shape[1], data.shape[0], *data.shape[2:]]
+    header = "float\n%d\n" % dim + "".join("%d\n" % s for s in sizes)
+
+    data = data.astype(np.float32)
+    with open(name, "wb") as f:  # opened after validation; closed on every path
+        f.write(header.encode("ascii"))
+        # Only 3-D arrays need the axis transpose; 1-D input used to reach it
+        # as well and fail.
+        (np.transpose(data, (2, 0, 1)) if dim == 3 else data).tofile(f)
+
+
+def check_dim_and_resize(tensor_list):
+    """Return the tensors unchanged if their ``shape[2:]`` all agree.
+
+    Otherwise a notice is printed and every tensor (the first included) is
+    bilinearly resized to the ``shape[2:]`` of the first one.
+    """
+    shape_list = [t.shape[2:] for t in tensor_list]
+    if len(set(shape_list)) <= 1:
+        return tensor_list
+
+    desired_shape = shape_list[0]
+    print(f"Inconsistent size of input video frames. All frames will be resized to {desired_shape}")
+    target_size = tuple(desired_shape)
+    return [
+        torch.nn.functional.interpolate(t, size=target_size, mode="bilinear") for t in tensor_list
+    ]
diff --git a/tools/scene_cut/README.md b/tools/scene_cut/README.md
new file mode 100644
index 0000000..1c1bae5
--- /dev/null
+++ b/tools/scene_cut/README.md
@@ -0,0 +1,63 @@
+# Scene Detection and Video Splitting
+
+- [Scene Detection and Video Splitting](#scene-detection-and-video-splitting)
+    - [Prepare Meta Files](#prepare-meta-files)
+    - [Scene Detection](#scene-detection)
+    - [Video Splitting](#video-splitting)
+
+In many cases, raw videos contain several scenes and are too long for training. Thus, it is essential to split them into shorter
+clips based on scenes. Here, we provide code for scene detection and video splitting.
+
+## Prepare Meta Files
+At this step, you should have a raw video dataset prepared. A meta file of the dataset information is needed for data processing. To create a meta file from a folder, run:
+
+```bash
+python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
+```
+This should output a `.csv` file with column `path`.
+
+If you already have a meta file for the videos and want to keep its information,
+**make sure** the meta file has a column `id` holding the id of each video, and that each video is named `{id}.mp4`.
+The following command will add a new column `path` to the meta file.
+
+```bash
+python tools/scene_cut/convert_id_to_path.py /path/to/meta.csv --folder_path /path/to/video/folder
+```
+This should output
+- `{prefix}_path-filtered.csv` with column `path` (broken videos filtered)
+- `{prefix}_path_intact.csv` with column `path` and `intact` (`intact` indicating a video is intact or not)
+
+
+## Scene Detection
+
+Install the required dependencies by following the "Data Dependencies" and "Scene Detection" sections of our [installation instructions](../../docs/installation.md).
+
+<!-- The next step is to detect scenes in a video.
+We use [`PySceneDetect`](https://github.com/Breakthrough/PySceneDetect) for this job.
+```bash
+pip install scenedetect[opencv] --upgrade
+``` -->
+
+**Make sure** the input meta file has column `path`, which is the path of a video.
+
+```bash
+python tools/scene_cut/scene_detect.py /path/to/meta.csv
+```
+The output is `{prefix}_timestamp.csv` with column `timestamp`. Each cell in column `timestamp` is a list of tuples,
+with each tuple indicating the start and end timestamp of a scene
+(e.g., `[('00:00:01.234', '00:00:02.345'), ('00:00:03.456', '00:00:04.567')]`).
+
+## Video Splitting
+After obtaining timestamps for scenes, we conduct video splitting (cutting).
+**Make sure** the meta file contains column `timestamp`.
+
+```bash
+python tools/scene_cut/cut.py /path/to/meta.csv --save_dir /path/to/output/dir
+```
+
+This will save video clips to `/path/to/output/dir`. The video clips are named `{video_id}_scene-{scene_id}.mp4`.
+
+To create a new meta file for the generated clips, run:
+```bash
+python -m tools.datasets.convert video /path/to/video/folder --output /path/to/save/meta.csv
+```
diff --git a/tools/scene_cut/__init__.py b/tools/scene_cut/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scene_cut/convert_id_to_path.py b/tools/scene_cut/convert_id_to_path.py
new file mode 100644
index 0000000..b8122d3
--- /dev/null
+++ b/tools/scene_cut/convert_id_to_path.py
@@ -0,0 +1,131 @@
+import argparse
+import json
+import os
+from functools import partial
+
+import cv2
+import numpy as np
+import pandas as pd
+from mmengine.logging import print_log
+from moviepy.editor import VideoFileClip
+from pandarallel import pandarallel
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
+    """Return True if ``video_path`` exists and opens with the chosen backend, else False.
+
+    ``mode`` is "moviepy" (VideoFileClip) or "cv2" (VideoCapture); any other value
+    raises ValueError once the file is known to exist. With ``verbose`` the
+    outcome is reported through ``print_log``.
+    """
+    if not os.path.exists(video_path):
+        if verbose:
+            print_log(f"Could not find '{video_path}'", logger=logger)
+        return False
+    if mode not in ("moviepy", "cv2"):
+        raise ValueError
+
+    try:
+        if mode == "moviepy":
+            VideoFileClip(video_path).close()  # release the reader instead of leaking it
+        else:
+            cap = cv2.VideoCapture(video_path)
+            opened = cap.isOpened()
+            cap.release()
+            if not opened:  # this case used to fall through and return None
+                raise IOError("cv2.VideoCapture could not open the file")
+    except Exception as e:
+        if verbose:
+            print_log(f"Error: {e}", logger=logger)
+            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
+        return False
+    if verbose:
+        print_log(f"The video file '{video_path}' is intact.", logger=logger)
+    return True
+
+
+def has_downloaded_success(json_path):
+    """Return True only if ``json_path`` is readable JSON whose "success" entry is the boolean ``True``."""
+    if not os.path.exists(json_path):
+        return False
+
+    try:
+        with open(json_path, "r") as f:
+            data = json.load(f)
+        # Anything but a literal True (missing key, non-bool value, False) counts as failure.
+        flag_ok = "success" in data and data["success"] is True
+    except Exception:
+        return False
+    return flag_ok
+
+
+def parse_args():
+    """Parse the CLI: positional ``meta_path`` plus --folder_path (required), --mode and --num_workers."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("meta_path", type=str)
+    cli.add_argument("--folder_path", type=str, required=True)
+    cli.add_argument("--mode", type=str, default=None)
+    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+
+    return cli.parse_args()
+
+
+def main():
+    """Add ``path`` (and ``intact``) columns to a meta CSV keyed by video ``id``.
+
+    Writes ``{name}_path_intact.csv`` (all rows, with ``intact``) and
+    ``{name}_path-filtered.csv`` (intact rows only) into the input file's directory.
+    --mode selects the check: ".mp4" probes the video, ".json" reads the
+    ``{id}.json`` file in the same folder, and no mode marks every row intact.
+    """
+    args = parse_args()
+
+    meta_path = args.meta_path
+    folder_path = args.folder_path
+    mode = args.mode
+
+    def is_intact(row, mode=None):
+        # Returns (intact, video_path) for one meta row.
+        video_id = row["id"]
+        video_path = os.path.join(folder_path, f"{video_id}.mp4")
+        if mode == ".mp4":
+            return bool(is_intact_video(video_path)), video_path
+        if mode == ".json":
+            json_path = os.path.join(folder_path, f"{video_id}.json")
+            return has_downloaded_success(json_path), video_path
+        if mode is None:
+            return True, video_path
+        raise ValueError
+
+    meta_dirpath = os.path.dirname(meta_path)
+    wo_ext, _ = os.path.splitext(os.path.basename(meta_path))
+
+    if args.num_workers is not None:
+        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
+    else:
+        pandarallel.initialize(progress_bar=True)
+    is_intact_partial = partial(is_intact, mode=mode)
+
+    meta = pd.read_csv(meta_path)
+    ret = meta.parallel_apply(is_intact_partial, axis=1)
+    intact, paths = list(zip(*ret))
+
+    meta["intact"] = intact
+    meta["path"] = paths
+    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
+    meta.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")
+
+    # Drop the column on a new frame rather than in place on the filtered
+    # selection, which can trigger pandas' chained-assignment warning.
+    meta_format = meta[np.array(intact)].drop("intact", axis=1)
+    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
+    meta_format.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py
new file mode 100644
index 0000000..e19c012
--- /dev/null
+++ b/tools/scene_cut/cut.py
@@ -0,0 +1,214 @@
+import cv2  # isort:skip
+
+import argparse
+import os
+import subprocess
+from functools import partial
+
+import pandas as pd
+from imageio_ffmpeg import get_ffmpeg_exe
+from pandarallel import pandarallel
+from scenedetect import FrameTimecode
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+def print_log(s, logger=None):
+    """Send ``s`` to ``logger.info`` when a logger is given, otherwise print it."""
+    # Choose the sink once, then emit.
+    emit = print if logger is None else logger.info
+    emit(s)
+
+
+def process_single_row(row, args):
+    """Cut one meta row's video into scene clips; return False if its timestamps are unusable.
+
+    With ``args.drop_invalid_timestamps`` the row is only validated (True when
+    the ``timestamp`` cell parses) and no clip is written.
+    """
+    import ast  # local import: only this function needs it
+
+    video_path = row["path"]
+    try:
+        if "timestamp" in row:
+            timestamp = row["timestamp"]
+            if not (timestamp.startswith("[") and timestamp.endswith("]")):
+                return False
+            # literal_eval accepts only Python literals, so a crafted CSV cell cannot run code as eval() would.
+            scene_list = ast.literal_eval(timestamp)
+            scene_list = [(FrameTimecode(s, fps=1000), FrameTimecode(t, fps=1000)) for s, t in scene_list]
+        else:
+            scene_list = [None]
+        if args.drop_invalid_timestamps:
+            return True
+    except Exception:
+        # Nothing can be cut without a scene list; this path used to continue with ``scene_list`` unbound.
+        return False
+
+    if "relpath" in row:
+        save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
+        os.makedirs(save_dir, exist_ok=True)
+    else:
+        save_dir = args.save_dir
+
+    shorter_size = args.shorter_size
+    if (shorter_size is not None) and ("height" in row) and ("width" in row):
+        # No resize when the smaller dimension is already within the target.
+        min_size = min(row["height"], row["width"])
+        if min_size <= shorter_size:
+            shorter_size = None
+
+    split_video(
+        video_path,
+        scene_list,
+        save_dir=save_dir,
+        min_seconds=args.min_seconds,
+        max_seconds=args.max_seconds,
+        target_fps=args.target_fps,
+        shorter_size=shorter_size,
+        logger=None,
+    )
+    return True
+
+
+def split_video(
+    video_path,
+    scene_list,
+    save_dir,
+    min_seconds=2,
+    max_seconds=15,
+    target_fps=30,
+    shorter_size=None,
+    verbose=False,
+    logger=None,
+):
+    """
+    Cut ``video_path`` into one clip per scene with ffmpeg.
+
+    scenes shorter than min_seconds will be ignored;
+    scenes longer than max_seconds will be cut to save the beginning max_seconds.
+    Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4;
+    a clip whose file already exists is skipped.
+
+    Args:
+        video_path (str): source video.
+        scene_list (List[Tuple[FrameTimecode, FrameTimecode] | None]): each element is (s, t): start and
+            end of a scene; a None element re-encodes the whole video.
+        save_dir (str): directory the clips are written to.
+        min_seconds (float | None)
+        max_seconds (float | None)
+        target_fps (int | None): passed to ffmpeg as ``-r``.
+        shorter_size (int | None): upper bound for the shorter side.
+        verbose (bool): log every saved clip.
+        logger: forwarded to ``print_log``.
+    Returns:
+        List[str]: paths of the clips written by this call.
+    """
+    FFMPEG_PATH = get_ffmpeg_exe()
+    # Clip names are derived from the source file name without its extension.
+    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]
+
+    save_path_list = []
+    for idx, scene in enumerate(scene_list):
+        if scene is not None:
+            s, t = scene  # FrameTimecode
+            duration = t - s
+            if min_seconds is not None and duration.get_seconds() < min_seconds:
+                continue
+            if max_seconds is not None:
+                max_duration = FrameTimecode(max_seconds, fps=s.framerate)
+                duration = min(max_duration, duration)
+
+        # TODO: fname pattern
+        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
+        if os.path.exists(save_path):
+            continue
+
+        # clip to cut
+        # Note: -ss after -i is very slow; put -ss before -i !!!
+        cmd = [FFMPEG_PATH, "-nostdin", "-y"]
+        if scene is None:
+            cmd += ["-i", video_path]
+        else:
+            cmd += ["-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())]
+
+        # target fps
+        if target_fps is not None:
+            cmd += ["-r", f"{target_fps}"]
+
+        # aspect ratio: bound the shorter side by shorter_size, let ffmpeg derive the other (-2)
+        if shorter_size is not None:
+            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,min({shorter_size},iw))':'if(gt(iw,ih),min({shorter_size},ih),-2)'"]
+
+        cmd += ["-map", "0:v", save_path]
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        if proc.returncode != 0:
+            # Report the failure instead of recording a clip that was not
+            # produced.
+            print_log(f"ffmpeg exited with code {proc.returncode} for '{save_path}'", logger=logger)
+            continue
+
+        # Record the clip that was written; the source path used to be appended here.
+        save_path_list.append(save_path)
+        if verbose:
+            print_log(f"Video clip saved to '{save_path}'", logger=logger)
+
+    return save_path_list
+
+
+def parse_args():
+    """Parse the CLI of the cutting script: the meta CSV, output directory, clip limits and parallelism flags."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("meta_path", type=str)
+    cli.add_argument("--save_dir", type=str)
+    cli.add_argument(
+        "--min_seconds", type=float, default=None, help="if not None, clip shorter than min_seconds is ignored"
+    )
+    cli.add_argument(
+        "--max_seconds", type=float, default=None, help="if not None, clip longer than max_seconds is truncated"
+    )
+    cli.add_argument("--target_fps", type=int, default=None, help="target fps of clips")
+    cli.add_argument(
+        "--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale"
+    )
+    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+    cli.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
+    cli.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps")
+    return cli.parse_args()
+
+
+def main():
+    """Cut every video listed in the meta CSV or, with --drop_invalid_timestamps, only write a filtered meta."""
+    args = parse_args()
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        raise SystemExit  # same exit status as the former exit() call
+    suffix = "timestamp.csv"
+    if args.drop_invalid_timestamps and not meta_path.endswith(suffix):
+        raise AssertionError("Only support *timestamp.csv")  # checked up front; a bare assert vanishes under -O
+
+    os.makedirs(args.save_dir, exist_ok=True)
+    if not args.disable_parallel:
+        if args.num_workers is not None:
+            pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
+        else:
+            pandarallel.initialize(progress_bar=True)
+    process_single_row_partial = partial(process_single_row, args=args)
+
+    meta = pd.read_csv(meta_path)
+    if not args.disable_parallel:
+        results = meta.parallel_apply(process_single_row_partial, axis=1)
+    else:
+        results = meta.apply(process_single_row_partial, axis=1)
+    if args.drop_invalid_timestamps:
+        # Swap only the trailing suffix; str.replace would also rewrite earlier matches in the path.
+        out_path = meta_path[: -len(suffix)] + "correct_timestamp.csv"
+        meta[results].to_csv(out_path, index=False)
+        print(f"Corrected timestamp file saved to '{out_path}'")
+
+
+if __name__ == "__main__":
+    main()
+    print("cut.py finished successfully.")
diff --git a/tools/scene_cut/cut_to_short.py b/tools/scene_cut/cut_to_short.py
new file mode 100644
index 0000000..ea6aa32
--- /dev/null
+++ b/tools/scene_cut/cut_to_short.py
@@ -0,0 +1,69 @@
+import argparse
+import os
+import subprocess
+from functools import partial
+
+import pandas as pd
+from pandarallel import pandarallel
+from tqdm import tqdm
+
+
+def parse_args():
+    """Parse the CLI: the meta CSV, segment length, source/destination roots and parallelism flags."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("meta_path", type=str)
+    cli.add_argument("--length", type=int, required=True, help="segment length in seconds; example: 1800")
+    cli.add_argument("--src_dir", type=str, required=True, help="exmaple: /path/to/dataset/raw/")
+    cli.add_argument("--dst_dir", type=str, required=True, help="exmaple: /path/to/dataset/cut_to_30min/")
+
+    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+    cli.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
+
+    return cli.parse_args()
+
+
+def process_single_row(row, args):
+    """Split one video into ``args.length``-second segments under ``args.dst_dir`` (stream copy, no audio)."""
+    path = row["path"]
+    assert path.startswith(args.src_dir), f"\npath: {path}\nsrc_dir:{args.src_dir}"
+
+    out_path = os.path.join(args.dst_dir, os.path.relpath(path, args.src_dir))
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    wo_ext, ext = os.path.splitext(out_path)
+
+    # Argument list (no shell) so paths with spaces or shell metacharacters are passed verbatim.
+    cmd = ["ffmpeg", "-i", path, "-c", "copy", "-an"]  # -an: no audio
+    # --length used to be ignored in favour of a hard-coded 60 seconds.
+    cmd += ["-f", "segment", "-segment_time", str(args.length), "-reset_timestamps", "1"]
+    cmd += ["-map", "0", "-segment_start_number", "0", f"{wo_ext}_%03d{ext}"]
+    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+
+def main():
+    """Run ``process_single_row`` over every row of the meta CSV, in parallel unless disabled."""
+    args = parse_args()
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    tqdm.pandas()
+    parallel = not args.disable_parallel
+    if parallel:
+        # Leave the worker count to pandarallel unless one was requested.
+        extra = {} if args.num_workers is None else {"nb_workers": args.num_workers}
+        pandarallel.initialize(progress_bar=True, **extra)
+    worker = partial(process_single_row, args=args)
+
+    # process
+    meta = pd.read_csv(meta_path)
+    if parallel:
+        meta.parallel_apply(worker, axis=1)
+    else:
+        meta.apply(worker, axis=1)
+
+    print("cut_to_short.py finished successfully.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scene_cut/scene_detect.py b/tools/scene_cut/scene_detect.py
new file mode 100644
index 0000000..eb9ffd9
--- /dev/null
+++ b/tools/scene_cut/scene_detect.py
@@ -0,0 +1,262 @@
+import argparse
+import os
+from functools import partial
+
+import numpy as np
+import pandas as pd
+from pandarallel import pandarallel
+from scenedetect import AdaptiveDetector, SceneManager, StatsManager, open_video
+from tqdm import tqdm
+
+tqdm.pandas()
+
+
+def detect(
+    video_path,
+    detector,
+    backend="pyav",
+    stats_file_path=None,
+    show_progress=False,
+    start_time=None,
+    end_time=None,
+    start_in_scene=False,
+):
+    """
+    Run ``detector`` over a video and return the resulting scene list.
+    ``start_time`` and ``end_time`` are added to the video's base timecode.
+
+    Adapted from scenedetect.detect()
+    Modifications:
+        - allow passing backend to open_video()
+    """
+    video = open_video(video_path, backend=backend)
+    if start_time is not None:
+        video.seek(video.base_timecode + start_time)
+    stop_at = None if end_time is None else video.base_timecode + end_time
+
+    # A StatsManager is attached only when frame metrics have to be saved to
+    # disk, which keeps memory consumption down otherwise.
+    stats = StatsManager() if stats_file_path else None
+    manager = SceneManager(stats)
+    manager.add_detector(detector)
+    manager.detect_scenes(video=video, show_progress=show_progress, end_time=stop_at)
+    if manager.stats_manager is not None:
+        manager.stats_manager.save_to_csv(csv_file=stats_file_path)
+
+    return manager.get_scene_list(start_in_scene=start_in_scene)
+
+
+def detect_transition(
+    video_path,
+    backend="pyav",
+    transition_seconds=1.0,
+    start_time=None,
+    end_time=None,
+    start_in_scene=False,
+    stats_file_path=None,
+    show_progress=False,
+):
+    """Detect scenes with two frame-skipping passes and trim the scenes around each cut.
+
+    Pass 1 starts at ``start_time`` (or the beginning), pass 2 half of ``transition_seconds``
+    later; both pass ``frame_skip=int(transition_seconds * fps)`` to ``detect_scenes``.
+    Returns a list of ``(start, end)`` pairs built from the merged cut points of both passes.
+    """
+    video = open_video(video_path, backend=backend)
+    fps = video.frame_rate
+    frame_skip = int(transition_seconds * fps)
+    t_ada, t_con = 2.0, 10.0
+    window = 1
+    if end_time is not None:
+        end_time = video.base_timecode + end_time
+
+    # 1. detect from the very beginning (or from start_time when it is given)
+    detector = AdaptiveDetector(
+        adaptive_threshold=t_ada,  # default 3.0
+        min_content_val=t_con,  # default 15.0
+        min_scene_len=1,
+        window_width=window,
+    )
+    scene_manager = SceneManager(StatsManager() if stats_file_path else None)
+    scene_manager.add_detector(detector)
+    if start_time is not None:
+        tmp = video.base_timecode + start_time
+        video.seek(tmp)
+    scene_manager.detect_scenes(
+        video=video,
+        show_progress=show_progress,
+        frame_skip=frame_skip,
+        end_time=end_time,
+    )
+    if scene_manager.stats_manager is not None:
+        scene_manager.stats_manager.save_to_csv(csv_file=stats_file_path)
+    s0 = scene_manager.get_scene_list(start_in_scene=start_in_scene)
+
+    # 2. detect from half transition_seconds, with a fresh detector and manager on the same video object
+    detector = AdaptiveDetector(
+        adaptive_threshold=t_ada,  # default 3.0
+        min_content_val=t_con,  # default 15.0
+        min_scene_len=1,
+        window_width=window,
+    )
+    scene_manager = SceneManager(StatsManager() if stats_file_path else None)
+    scene_manager.add_detector(detector)
+    tmp = video.base_timecode + transition_seconds / 2
+    if start_time is not None:
+        tmp = tmp + start_time
+    video.seek(tmp)
+    scene_manager.detect_scenes(
+        video=video,
+        show_progress=show_progress,
+        frame_skip=frame_skip,
+        end_time=end_time,
+    )
+    if scene_manager.stats_manager is not None:
+        scene_manager.stats_manager.save_to_csv(csv_file=f"{stats_file_path}.csv")  # pass-2 stats: separate file
+    s1 = scene_manager.get_scene_list(start_in_scene=start_in_scene)
+
+    # merge s0, s1: pass-1 scene starts and pass-2 scene ends, two-pointer merge (assumes both are ascending)
+    s0_s = [x[0] for x in s0]
+    s1_t = [x[1] for x in s1]
+    i, j = 0, 0
+    merged = []
+    while i < len(s0_s) and j < len(s1_t):
+        l, r = s0_s[i], s1_t[j]
+        if l < r:
+            merged.append(l)
+            i += 1
+        elif l == r:  # equal cut points are kept once
+            merged.append(l)
+            i += 1
+            j += 1
+        else:
+            merged.append(r)
+            j += 1
+    merged.extend(s0_s[i:])
+    merged.extend(s1_t[j:])
+
+    # remove transitions: consecutive merged points bound a scene; gaps shorter than m are skipped
+    scene_list = []
+    m = transition_seconds
+    for idx in range(len(merged) - 1):
+        cur = merged[idx]
+        next = merged[idx + 1]
+        if next - cur < m:
+            continue
+
+        if idx + 2 < len(merged) and (merged[idx + 2] - next) < m:
+            # detected by both s0 & s1
+            next = next - m
+        elif idx + 2 < len(merged):
+            next = next - 1.5 * m
+        if idx - 1 >= 0 and (cur - merged[idx - 1]) < m:
+            # detected by both s0 & s1
+            cur = cur + 0.5 * m
+        elif idx > 0:  # NOTE(review): same adjustment as the branch above — confirm this is intended
+            cur = cur + 0.5 * m
+        if cur < next:
+            scene_list.append((cur, next))
+
+    return scene_list
+
+
+def process_single_row(row, args):
+    """Detect scenes for one meta row.
+
+    Returns ``(ok, timestamp, timestamp_intact)``: both timestamps are ``str()`` of a list of
+    (start, end) timecode pairs, the first additionally split so that no piece exceeds
+    ``args.max_seconds``. Any error is printed and yields ``(False, "", "")``.
+    """
+    video_path = row["path"]
+
+    try:
+        if args.transition_seconds is not None:
+            scene_list = detect_transition(
+                video_path,
+                transition_seconds=args.transition_seconds,
+                start_time=args.start_time,
+                end_time=args.end_time,
+                start_in_scene=True,
+            )
+        else:
+            detector = AdaptiveDetector(adaptive_threshold=2.0, min_content_val=10.0, min_scene_len=1)
+            raw_scenes = detect(
+                video_path, detector, start_time=args.start_time, end_time=args.end_time, start_in_scene=True
+            )
+            # Shorten every scene by 1/framerate seconds at its end; scenes
+            # that would become empty are dropped.
+            margin = 1.0 / raw_scenes[0][0].framerate
+            scene_list = [(s, t - margin) for s, t in raw_scenes if s < t - margin]
+        timestamp_intact = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list]
+
+        if args.max_seconds is None:
+            timestamp_cut = timestamp_intact
+        else:
+            timestamp_cut = []
+            for start, end in scene_list:
+                # Peel off max_seconds-long pieces until the remainder fits.
+                while (end - start).get_seconds() > args.max_seconds:
+                    piece_end = start + args.max_seconds
+                    timestamp_cut.append((start.get_timecode(), piece_end.get_timecode()))
+                    start = piece_end
+                timestamp_cut.append((start.get_timecode(), end.get_timecode()))
+        return True, str(timestamp_cut), str(timestamp_intact)
+
+    except Exception as e:
+        print(f"Video '{video_path}' with error {e}")
+        return False, "", ""
+
+
+def parse_args():
+    """Parse the CLI of the scene detection script: the meta CSV, time limits and parallelism flags."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("meta_path", type=str)
+    cli.add_argument(
+        "--max_seconds", type=float, default=None, help="if not None, any single timestamp will not exceed max_seconds"
+    )
+    cli.add_argument("--transition_seconds", type=float, default=None, help="if not None, use detect_transition()")
+    cli.add_argument("--start_time", type=float, default=None, help="if not None, start detection from start_time")
+    cli.add_argument("--end_time", type=float, default=None, help="if not None, end detection at end_time")
+    cli.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+    cli.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
+
+    return cli.parse_args()
+
+
+def main():
+    """Detect scenes for every video in the meta CSV and save the result as ``{meta}_timestamp{ext}``."""
+    args = parse_args()
+    if args.transition_seconds is not None:
+        assert args.transition_seconds > 0
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    parallel = not args.disable_parallel
+    if parallel:
+        # Leave the worker count to pandarallel unless one was requested.
+        extra = {} if args.num_workers is None else {"nb_workers": args.num_workers}
+        pandarallel.initialize(progress_bar=True, **extra)
+    worker = partial(process_single_row, args=args)
+
+    meta = pd.read_csv(meta_path)
+    apply_fn = meta.parallel_apply if parallel else meta.apply
+    ret = apply_fn(worker, axis=1)
+
+    # Each result is (success, timestamp, timestamp_intact); regroup them per field.
+    succ, timestamp_cut, timestamp_intact = list(zip(*ret))
+    meta["timestamp"] = timestamp_cut
+    meta["timestamp_intact"] = timestamp_intact
+    # Keep only the rows whose detection succeeded.
+    meta = meta[np.array(succ)]
+
+    stem, ext = os.path.splitext(meta_path)
+    out_path = f"{stem}_timestamp{ext}"
+    meta.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scoring/README.md b/tools/scoring/README.md
new file mode 100644
index 0000000..40949c7
--- /dev/null
+++ b/tools/scoring/README.md
@@ -0,0 +1,115 @@
+# Scoring and Filtering
+
+- [Scoring and Filtering](#scoring-and-filtering)
+  - [Aesthetic Score](#aesthetic-score)
+  - [Optical Flow Score](#optical-flow-score)
+  - [OCR](#ocr)
+  - [Matching Score](#matching-score)
+  - [Filtering](#filtering)
+
+## Aesthetic Score
+
+To evaluate the aesthetic quality of videos, we use the scoring model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
+
+The aesthetic score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for high aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
+
+For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images as input.
+The throughput of our code is ~1K videos/s on a single H800 GPU. It also supports running on multiple GPUs for further acceleration.
+
+First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies".
+
+Next, download the scoring model to `./pretrained_models/aesthetic.pth`.
+
+```bash
+wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
+```
+
+<!-- First, install the required packages and download the scoring model to `./pretrained_models/aesthetic.pth`.
+```bash
+# pip install
+pip install git+https://github.com/openai/CLIP.git
+pip install decord
+
+# get pretrained model
+wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
+``` -->
+
+Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
+```bash
+torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
+```
+This will generate one part file per process (rank) under `/path/to/parts/` and, on rank 0, the merged `/path/to/meta_aes.csv`. To merge the part files manually, run `python -m tools.datasets.datautil /path/to/parts/meta_aes_part_*.csv --output /path/to/meta_aes.csv`.
+
+## Optical Flow Score
+
+Optical flow scores are used to assess the motion of a video. Higher optical flow scores indicate larger movement.
+We use the [UniMatch](https://github.com/autonomousvision/unimatch) model for this task.
+
+First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies".
+
+Next, download the pretrained model to `./pretrained_models/unimatch/`:
+```bash
+wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P ./pretrained_models/unimatch/
+```
+
+Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
+```bash
+torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
+```
+
+This should output `/path/to/meta_flow.csv` with column `flow`.
+
+## OCR
+Some videos contain dense text scenes, such as news broadcasts and advertisements, which are not desired for training.
+We apply Optical Character Recognition (OCR) to detect text and drop samples with dense text. Here, we use
+the [DBNet++](https://arxiv.org/abs/2202.10304) model implemented by [MMOCR](https://github.com/open-mmlab/mmocr/).
+
+First, install the required packages following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "OCR" section.
+
+<!-- First, install [MMOCR](https://mmocr.readthedocs.io/en/dev-1.x/get_started/install.html).
+For reference, we install packages of these versions.
+```
+torch==2.0.1
+mmcv==2.0.1
+mmdet==3.1.0
+mmocr==1.0.1
+``` -->
+
+Then, run the following command. **Make sure** the meta file has column `path` (path to the sample).
+<!-- ```bash
+torchrun --standalone --nproc_per_node 8 tools/scoring/ocr/inference.py /path/to/meta.csv
+``` -->
+```bash
+torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference /path/to/meta.csv
+```
+This should output `/path/to/meta_ocr.csv` with column `ocr`, indicating the number of text regions with detection confidence > 0.3.
+
+
+## Matching Score
+
+Matching scores are calculated to evaluate the alignment between an image/video and its caption.
+Here, we use the [CLIP](https://github.com/openai/CLIP) model, which is trained on image-text pairs.
+We simply use the cosine similarity as the matching score.
+For videos, we extract the middle frame and compare it with the caption.
+
+First, install OpenAI CLIP.
+```bash
+pip install git+https://github.com/openai/CLIP.git
+```
+
+Then, run the following command. **Make sure** the meta file has column `path` (path to the sample) and `text` (caption of the sample).
+
+```bash
+torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
+```
+
+This should output `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.
+
+
+## Filtering
+Once scores are obtained, it is simple to filter samples based on these scores. Here is an example that removes
+samples with an aesthetic score < 5.0.
+```
+python -m tools.datasets.datautil /path/to/meta.csv --aesmin 5.0
+```
+This should output `/path/to/meta_aesmin5.0.csv`, which only contains samples with `aes` >= 5.0.
diff --git a/tools/scoring/__init__.py b/tools/scoring/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/aesthetic/__init__.py b/tools/scoring/aesthetic/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/aesthetic/inference.py b/tools/scoring/aesthetic/inference.py
new file mode 100644
index 0000000..d730e93
--- /dev/null
+++ b/tools/scoring/aesthetic/inference.py
@@ -0,0 +1,213 @@
+# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
+import cv2  # isort:skip
+
+import argparse
+import gc
+import os
+from datetime import timedelta
+
+import clip
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.utils.data import DataLoader, DistributedSampler
+from torchvision.datasets.folder import pil_loader
+from tqdm import tqdm
+
+from tools.datasets.utils import extract_frames, is_video
+
+NUM_FRAMES_POINTS = {  # number of frames -> `points` passed to extract_frames() for a video
+    1: (0.5,),
+    2: (0.25, 0.5),
+    3: (0.1, 0.5, 0.9),
+}
+
+
+def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
+    """Write the gathered scores into ``meta[column]`` and return only the scored rows.
+
+    ``gathered_list`` holds one ``(indices, scores)`` pair per rank. ``meta`` is modified in
+    place; an index that occurs several times keeps the score of its first occurrence.
+    """
+    from itertools import zip_longest
+
+    pad = object()  # fills the shorter lists so that the longer ones are not cut short
+
+    def interleave(per_rank):
+        # same round-robin order as a plain zip(), but nothing is dropped when lengths differ
+        return [v for group in zip_longest(*per_rank, fillvalue=pad) for v in group if v is not pad]
+
+    flat_indices = np.array(interleave([pair[0] for pair in gathered_list]))
+    flat_scores = np.array(interleave([pair[1] for pair in gathered_list]))
+
+    # filter duplicates
+    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
+    meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
+    return meta.loc[unique_indices]  # drop indices in meta not in unique_indices
+
+
+class VideoTextDataset(torch.utils.data.Dataset):
+    """Dataset over the rows of a meta CSV that yields stacked, transformed frames.
+
+    An item is a dict with the row position (``index``) and ``images``: the transformed image
+    for a non-video path, or the transformed frames that ``extract_frames`` returns for the
+    ``points`` selected by ``num_frames``, stacked into one tensor.
+    """
+
+    def __init__(self, meta_path, transform=None, num_frames=3):
+        self.meta_path = meta_path
+        self.meta = pd.read_csv(meta_path)
+        self.transform = transform
+        # raises KeyError for a frame count that NUM_FRAMES_POINTS does not define
+        self.points = NUM_FRAMES_POINTS[num_frames]
+
+    def __len__(self):
+        return len(self.meta)
+
+    def __getitem__(self, index):
+        path = self.meta.iloc[index]["path"]
+        if is_video(path):
+            frames = extract_frames(path, points=self.points, backend="opencv")
+        else:
+            frames = [pil_loader(path)]
+
+        # transform every frame, then stack them along a new leading dimension
+        stacked = torch.stack([self.transform(frame) for frame in frames])
+        return dict(index=index, images=stacked)
+
+
+class MLP(nn.Module):
+    """Stack of linear layers with dropout that maps a feature vector to a single score."""
+
+    def __init__(self, input_size):
+        super().__init__()
+        self.input_size = input_size
+        # The order is kept as-is so that the ``layers.<i>`` parameter names do not change.
+        stages = [
+            nn.Linear(self.input_size, 1024), nn.Dropout(0.2),
+            nn.Linear(1024, 128), nn.Dropout(0.2),
+            nn.Linear(128, 64), nn.Dropout(0.1),
+            nn.Linear(64, 16), nn.Linear(16, 1),
+        ]
+        self.layers = nn.Sequential(*stages)
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+class AestheticScorer(nn.Module):
+    """CLIP ``ViT-L/14`` image encoder followed by the ``MLP`` score head."""
+
+    def __init__(self, input_size, device):
+        super().__init__()
+        self.mlp = MLP(input_size)
+        self.clip, self.preprocess = clip.load("ViT-L/14", device=device)
+        self.eval()
+        self.to(device)
+
+    def forward(self, x):
+        # L2-normalise the CLIP image features and cast them with .float() before the MLP head
+        return self.mlp(F.normalize(self.clip.encode_image(x), p=2, dim=-1).float())
+
+
+def parse_args():
+    """Parse the command-line arguments of the aesthetic scoring script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    int_options = (("--bs", 1024, "Batch size"), ("--num_workers", 16, "Number of workers"),
+                   ("--prefetch_factor", 3, "Prefetch factor"), ("--num_frames", 3, "Number of frames to extract"))
+    for flag, default, help_text in int_options:
+        parser.add_argument(flag, type=int, default=default, help=help_text)
+    parser.add_argument("--skip_if_existing", action="store_true")
+
+    return parser.parse_args()
+
+
+def main():
+    """Score every sample of the meta CSV with the aesthetic predictor.
+
+    Each rank scores its shard and writes ``parts/<name>_aes_part_<rank><ext>`` next to the
+    output file; rank 0 then merges the scores of all ranks into ``<name>_aes<ext>``.
+    """
+    args = parse_args()
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    wo_ext, ext = os.path.splitext(meta_path)
+    out_path = f"{wo_ext}_aes{ext}"
+    if args.skip_if_existing and os.path.exists(out_path):
+        print(f"Output meta file '{out_path}' already exists. Exit.")
+        exit()
+
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+
+    # build model
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = AestheticScorer(768, device)
+    model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device))
+    preprocess = model.preprocess
+
+    # build dataset (NOTE(review): --prefetch_factor is parsed but not forwarded to the DataLoader)
+    dataset = VideoTextDataset(args.meta_path, transform=preprocess, num_frames=args.num_frames)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.bs,
+        num_workers=args.num_workers,
+        sampler=DistributedSampler(
+            dataset,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=False,
+            drop_last=False,
+        ),
+    )
+
+    # compute aesthetic scores
+    indices_list = []
+    scores_list = []
+    model.eval()
+    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
+        indices = batch["index"]
+        images = batch["images"].to(device, non_blocking=True)
+        B = images.shape[0]
+        images = rearrange(images, "B N C H W -> (B N) C H W")
+        with torch.no_grad():
+            scores = model(images)
+        # average the per-frame scores of each sample
+        scores = rearrange(scores, "(B N) 1 -> B N", B=B).mean(dim=1)
+        scores_np = scores.to(torch.float32).cpu().numpy()
+
+        indices_list.extend(indices.tolist())
+        scores_list.extend(scores_np.tolist())
+
+    # save local results; the rank suffix is inserted before the real extension so that the part
+    # files of different ranks never share a name, whatever the extension of the meta file is
+    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes")
+    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
+    os.makedirs(save_dir_local, exist_ok=True)
+    part_stem, part_ext = os.path.splitext(os.path.basename(out_path))
+    out_path_local = os.path.join(save_dir_local, f"{part_stem}_part_{dist.get_rank()}{part_ext}")
+    meta_local.to_csv(out_path_local, index=False)
+
+    # wait for all ranks to finish data processing
+    dist.barrier()
+    torch.cuda.empty_cache()
+    gc.collect()
+    gathered_list = [None] * dist.get_world_size()
+    dist.all_gather_object(gathered_list, (indices_list, scores_list))
+    if dist.get_rank() == 0:
+        meta_new = merge_scores(gathered_list, dataset.meta, column="aes")
+        meta_new.to_csv(out_path, index=False)
+        print(f"New meta with aesthetic scores saved to '{out_path}'.")
+
+
+if __name__ == "__main__":  # run the scoring only when executed as a script or via `-m`
+    main()
diff --git a/tools/scoring/deduplicate/__init__.py b/tools/scoring/deduplicate/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/deduplicate/inference.py b/tools/scoring/deduplicate/inference.py
new file mode 100644
index 0000000..84ed1ad
--- /dev/null
+++ b/tools/scoring/deduplicate/inference.py
@@ -0,0 +1,176 @@
+import argparse
+import os
+from datetime import timedelta
+
+import clip
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, DistributedSampler
+from torchvision.datasets.folder import pil_loader
+from tqdm import tqdm
+
+from tools.datasets.utils import extract_frames, is_video
+
+
+class VideoDataset(torch.utils.data.Dataset):
+    """Dataset over a meta CSV that yields one transformed image per row.
+
+    For a video, the frame that ``extract_frames`` returns for ``points=[0.5]`` is used.
+    """
+
+    def __init__(self, meta_path, transform):
+        self.meta_path = meta_path
+        self.meta = pd.read_csv(meta_path)
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.meta)
+
+    def __getitem__(self, index):
+        path = self.meta.iloc[index]["path"]
+        if is_video(path):
+            frame = extract_frames(path, points=[0.5], backend="opencv")[0]
+        else:
+            frame = pil_loader(path)
+        # the size is read before the transform is applied and is passed on as a string
+        size_text = str(frame.size)  # W, H
+        return dict(index=index, images=self.transform(frame), img_size=size_text)
+
+
+def parse_args():
+    """Parse the command-line arguments of the deduplication script."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    cli.add_argument("--bs", type=int, default=128, help="Batch size")
+    cli.add_argument("--thresh", type=float, default=0.98, help="similarity thresh")
+    cli.add_argument("--num_workers", type=int, default=16, help="Number of workers")
+    return cli.parse_args()
+
+
+def main():  # near-duplicate detection with CLIP image features; rank 0 writes the *_dedup and *_dup metas
+    args = parse_args()
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    wo_ext, ext = os.path.splitext(meta_path)
+    out_path_dedup = f"{wo_ext}_dedup{ext}"  # rows that are kept
+    out_path_dup = f"{wo_ext}_dup{ext}"  # rows reported as duplicates
+
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+
+    # build model
+    device = "cuda" if torch.cuda.is_available() else "cpu"  # Note: do not use torch.device('cuda')!!!
+    model, preprocess = clip.load("ViT-L/14", device=device)
+    # preprocess: resize shorter size to 224 by keeping ar, then center crop
+
+    # build dataset
+    dataset = VideoDataset(meta_path=meta_path, transform=preprocess)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.bs,
+        num_workers=args.num_workers,
+        sampler=DistributedSampler(
+            dataset,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=False,
+            drop_last=False,
+        ),
+    )
+
+    # encode images and store the feature
+    print("Begin to generate features")
+    model.eval()
+    feat_list = []  # store the feature
+    indices_list = []  # store the indices
+    img_size_list = []
+    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
+        indices = batch["index"]
+        imgs = batch["images"].to(device, non_blocking=True)
+        img_size = batch["img_size"]
+
+        with torch.no_grad():
+            feat_img = model.encode_image(imgs)
+        feat_img = F.normalize(feat_img, dim=1)  # unit-length rows: the dot products below are cosine similarities
+
+        feat_list.append(feat_img)
+        indices_list.extend(indices.tolist())
+        img_size_list.extend(img_size)
+
+    feats = torch.cat(feat_list, dim=0)
+
+    # all_gather: receive buffers are shaped like the local features, i.e. equal counts per rank are assumed
+    feats_gathered = [torch.zeros_like(feats, device=device) for _ in range(dist.get_world_size())]
+    dist.all_gather(feats_gathered, feats)
+    feats_all = torch.cat(feats_gathered)
+
+    indices_gathered = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(indices_gathered, indices_list)
+    indices_all = np.array([x for sub in indices_gathered for x in sub])
+
+    img_size_gathered = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(img_size_gathered, img_size_list)
+    img_size_all = [x for sub in img_size_gathered for x in sub]
+    # keep one entry per dataset index (its first occurrence); indices_unique comes back sorted
+    indices_unique, indices_unique_idx = np.unique(indices_all, return_index=True)
+    feats_unique = feats_all[torch.from_numpy(indices_unique_idx)]
+    img_size_unique = [img_size_all[x] for x in indices_unique_idx]
+
+    if dist.get_rank() == 0:
+        # get similarity scores: `chunk` rows at a time against all features from position idx onwards
+        non_zero_list = []
+        sim_scores_list = []
+        chunk = 1000
+        for idx in tqdm(range(0, feats_unique.shape[0], chunk)):
+            sim_mat = torch.matmul(feats_unique[idx : idx + chunk], feats_unique[idx:].T).cpu().numpy()
+            sim_mat_upper = np.triu(sim_mat, k=1)  # keep column > row only: every pair once, no self-pairs
+            non_zero_i = np.nonzero(sim_mat_upper >= args.thresh)  # NOTE(review): zeroed entries also pass if thresh <= 0
+            sim_scores_i = sim_mat[non_zero_i]
+
+            non_zero_np = np.stack(non_zero_i) + idx  # [2, N]
+            non_zero_list.append(non_zero_np)
+            sim_scores_list.append(sim_scores_i)
+
+        non_zero_indices = np.concatenate(non_zero_list, axis=1)
+        sim_scores = np.concatenate(sim_scores_list)
+        dup_dict = {}  # later position y -> (most similar earlier position x, similarity)
+        for x, y, s in zip(non_zero_indices[0].tolist(), non_zero_indices[1].tolist(), sim_scores.tolist()):
+            # only count pairs with the same image size
+            if img_size_unique[x] != img_size_unique[y]:
+                continue
+
+            if y not in dup_dict:
+                dup_dict[y] = (x, s)
+            elif dup_dict[y][1] < s:
+                dup_dict[y] = (x, s)
+
+        dup_list = [(k, v) for k, v in dup_dict.items()]
+        dup_list = sorted(dup_list, key=lambda x: x[1][1], reverse=True)  # most similar pairs first
+        dup_inds = [x[0] for x in dup_list]
+        sim_inds = [x[1][0] for x in dup_list]
+        sim_scores_dup = [x[1][1] for x in dup_list]
+
+        remain_inds = sorted(list(set(indices_unique.tolist()) - set(dup_inds)))
+
+        # save; NOTE(review): positions in feats_unique are used as row numbers of dataset.meta (valid if indices_unique is 0..N-1)
+        meta_unique = dataset.meta.iloc[remain_inds]
+        meta_unique.to_csv(out_path_dedup, index=False)
+        print(f"New meta without duplication saved to '{out_path_dedup}'.")
+
+        meta_dup = dataset.meta.iloc[dup_inds].copy().reset_index(drop=True)
+        path_dup = dataset.meta.iloc[sim_inds]["path"].copy().reset_index(drop=True)
+        meta_dup["path_dup"] = path_dup
+        meta_dup["sim"] = sim_scores_dup
+        meta_dup.to_csv(out_path_dup, index=False)
+        print(f"New meta with duplicated samples saved to '{out_path_dup}'.")
+
+
+if __name__ == "__main__":  # run the deduplication only when executed as a script or via `-m`
+    main()
diff --git a/tools/scoring/matching/__init__.py b/tools/scoring/matching/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/matching/inference.py b/tools/scoring/matching/inference.py
new file mode 100644
index 0000000..70209eb
--- /dev/null
+++ b/tools/scoring/matching/inference.py
@@ -0,0 +1,138 @@
+import argparse
+import os
+
+import clip
+import colossalai
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, DistributedSampler
+from torchvision.datasets.folder import pil_loader
+from tqdm import tqdm
+
+from tools.datasets.utils import extract_frames, is_video
+
+
+def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
+    """Write the gathered scores into ``meta[column]`` (in place) and return ``meta``.
+
+    ``gathered_list`` holds one ``(indices, scores)`` pair per rank; an index that occurs
+    several times keeps the score of its first occurrence.
+    """
+    from itertools import zip_longest
+
+    pad = object()  # fills the shorter lists so that the longer ones are not cut short
+
+    def interleave(per_rank):
+        # same round-robin order as a plain zip(), but nothing is dropped when lengths differ
+        return [v for group in zip_longest(*per_rank, fillvalue=pad) for v in group if v is not pad]
+    flat_indices = np.array(interleave([pair[0] for pair in gathered_list]))
+    flat_scores = np.array(interleave([pair[1] for pair in gathered_list]))
+    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)  # filter duplicates
+    meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
+    return meta
+
+
+class VideoTextDataset(torch.utils.data.Dataset):
+    """Dataset over a meta CSV that yields ``(image, caption tokens, row position)`` items.
+
+    For a video, the frame that ``extract_frames`` returns for ``points=[0.5]`` is used; the
+    caption comes from the ``text`` column and is tokenized with ``clip.tokenize``.
+    """
+
+    def __init__(self, meta_path, transform):
+        self.meta_path = meta_path
+        self.meta = pd.read_csv(meta_path)
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.meta)
+
+    def __getitem__(self, index):
+        sample = self.meta.iloc[index]
+        path = sample["path"]
+        if is_video(path):
+            frame = extract_frames(path, points=[0.5], backend="opencv")[0]
+        else:
+            frame = pil_loader(path)
+        image = self.transform(frame)
+        return image, clip.tokenize(sample["text"], truncate=True).squeeze(), index
+
+
+def parse_args():
+    """Parse the command-line arguments of the matching score script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    for flag, help_text in (("--bs", "Batch size"), ("--num_workers", "Number of workers")):
+        parser.add_argument(flag, type=int, default=16, help=help_text)
+    parser.add_argument("--skip_if_existing", action="store_true")
+    return parser.parse_args()
+
+
+def main():
+    """Score each sample against its caption with CLIP; rank 0 writes ``<name>_match<ext>``."""
+    args = parse_args()
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    wo_ext, ext = os.path.splitext(meta_path)
+    out_path = f"{wo_ext}_match{ext}"
+    if args.skip_if_existing and os.path.exists(out_path):
+        print(f"Output meta file '{out_path}' already exists. Exit.")
+        exit()
+
+    colossalai.launch_from_torch({})
+
+    # build model
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model, preprocess = clip.load("ViT-L/14", device=device)
+    logit_scale = model.logit_scale.exp().item()
+
+    # build dataset
+    dataset = VideoTextDataset(meta_path=meta_path, transform=preprocess)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.bs,
+        num_workers=args.num_workers,
+        sampler=DistributedSampler(
+            dataset,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=False,
+            drop_last=False,
+        ),
+    )
+
+    # compute scores
+    indices_list = []
+    scores_list = []
+    model.eval()
+    for imgs, text, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
+        imgs = imgs.to(device)
+        text = text.to(device)
+        with torch.no_grad():
+            feat_img = model.encode_image(imgs)
+            feat_text = model.encode_text(text)
+        # matching score: cosine similarity of the two features, scaled by CLIP's logit scale
+        feat_img = F.normalize(feat_img, dim=1)
+        feat_text = F.normalize(feat_text, dim=1)
+        clip_scores = logit_scale * (feat_img * feat_text).sum(dim=1)
+        # gather plain ints: extending the list with the index tensor itself would store 0-dim tensors
+        indices_list.extend(indices.tolist())
+        scores_list.extend(clip_scores.cpu().tolist())
+
+    gathered_list = [None] * dist.get_world_size()
+    dist.all_gather_object(gathered_list, (indices_list, scores_list))
+    if dist.get_rank() == 0:
+        meta_new = merge_scores(gathered_list, dataset.meta, column="match")
+        meta_new.to_csv(out_path, index=False)
+        print(f"New meta with matching scores saved to '{out_path}'.")
+
+
+if __name__ == "__main__":  # run the scoring only when executed as a script
+    main()
diff --git a/tools/scoring/motion/README.md b/tools/scoring/motion/README.md
new file mode 100644
index 0000000..e8c28b7
--- /dev/null
+++ b/tools/scoring/motion/README.md
@@ -0,0 +1,12 @@
+# Installation
+
+Please install the VMAF and FFmpeg packages in the base environment.
+
+## VMAF
+
+To calculate the motion score and the VMAF score, we use VMAF with FFmpeg. Please follow the installation guides for [VMAF](https://github.com/Netflix/vmaf/blob/master/libvmaf/README.md#install) and for [using VMAF with FFmpeg](https://github.com/Netflix/vmaf/blob/master/resource/doc/ffmpeg.md), and install an FFmpeg build with VMAF support. Note that you need to export the path to VMAF before installing FFmpeg.
+```
+export LD_LIBRARY_PATH=/usr/local/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+
+# change to the directory on your machine that contains libvmaf.so.3
+```
diff --git a/tools/scoring/motion/motion.py b/tools/scoring/motion/motion.py
new file mode 100644
index 0000000..d197571
--- /dev/null
+++ b/tools/scoring/motion/motion.py
@@ -0,0 +1,148 @@
+# Please follow the instructions in the README.md file to install dependency.
+
+import argparse
+import json
+import math
+import multiprocessing
+import os
+import shutil
+import subprocess
+
+import pandas as pd
+import tqdm
+
+# Constants
+TEMP_FOLDER = "/mnt/ddn/yeanbang/experiments/data_pipeline/temp_videos"  # scratch dir; created and removed by process_videos_parallel()
+DOCKER_IMAGE = "vmaf_im"  # NOTE(review): not referenced anywhere else in this file
+EXTRACT_MVS_RUNNABLE_PATH = "~/extract_mvs"  # executable that process_video() runs through the shell
+GET_VMAF_MOTION_SCORE = True  # enables the ffmpeg/libvmaf pass in process_video()
+GET_MOTION_VECTOR_AVERAGE = True  # enables the motion-vector pass in process_video()
+# Shell command template: process_video() substitutes $SRC_VIDEO$ and $OUTPUT_PATH$; the same video is fed as both inputs.
+VMAF_CMD = """ffmpeg \
+    -nostats -loglevel 0 \
+    -r 24 -i "$SRC_VIDEO$" \
+    -r 24 -i "$SRC_VIDEO$" \
+    -lavfi "[0:v]setpts=PTS-STARTPTS[reference]; \
+            [1:v]setpts=PTS-STARTPTS[distorted]; \
+            [distorted][reference]libvmaf=log_fmt=json:log_path=$OUTPUT_PATH$:n_threads=4" \
+    -f null -"""
+
+
+def create_temp_folder():
+    """Creates the temp folder (``TEMP_FOLDER``) if it doesn't exist."""
+    # exist_ok avoids the check-then-create race when the folder appears in between
+    os.makedirs(TEMP_FOLDER, exist_ok=True)
+
+
+def calculate_magnitude(mvx, mvy):  # Euclidean length of the vector (mvx, mvy)
+    return math.sqrt(mvx**2 + mvy**2)
+
+
+def get_average_motion_vector_strength(motion_vector_log_file):
+    """Return the mean magnitude of the motion vectors stored in a CSV log file.
+
+    The log must provide the columns ``motion_x`` and ``motion_y``. ``0`` is returned
+    when the log contains no rows.
+    """
+    log = pd.read_csv(motion_vector_log_file)
+    xs = log["motion_x"].to_numpy()
+    ys = log["motion_y"].to_numpy()
+
+    # running sum and count of the per-vector magnitudes
+    total_magnitude = 0
+    vector_count = 0
+    for mvx, mvy in zip(xs, ys):
+        total_magnitude += calculate_magnitude(mvx, mvy)
+        vector_count += 1
+
+    return total_magnitude / vector_count if vector_count else 0
+
+
+def process_video(video_info):
+    """Compute the VMAF-based scores and the average motion-vector strength of one video.
+
+    ``video_info`` is an ``(index, video_path)`` pair. Returns ``(index, motion_score, vmaf_score,
+    average_motion_vector_strength)``; a score that could not be computed stays ``0.0``. If the
+    VMAF step fails, the motion-vector step is skipped.
+    """
+    import shlex
+
+    index, video_path = video_info
+    average_motion_vector_strength = 0.0
+    motion_score = 0.0
+    vmaf_score = 0.0
+
+    if GET_VMAF_MOTION_SCORE:
+        output_file = f"{TEMP_FOLDER}/output_{index}.json"
+        try:
+            # shell-quote the path (in place of the template's own double quotes) so that quotes, $ or backticks stay literal
+            vmaf_cmd = VMAF_CMD.replace('"$SRC_VIDEO$"', shlex.quote(video_path)).replace("$OUTPUT_PATH$", output_file)
+            subprocess.run(vmaf_cmd, shell=True, check=True)
+            if os.path.exists(output_file):
+                with open(output_file) as f:
+                    pooled = json.load(f)["pooled_metrics"]
+                # ref: https://video.stackexchange.com/questions/24210/how-should-i-interpret-the-results-from-netflix-vmaf
+                motion_mean = (pooled["integer_motion"]["mean"] + pooled["integer_motion2"]["mean"]) / 2.0
+                motion_score, vmaf_score = motion_mean, pooled["vmaf"]["mean"]
+        except Exception as e:
+            print(f"Error processing video {index}: {e}")
+            return index, motion_score, vmaf_score, average_motion_vector_strength
+        finally:
+            if os.path.exists(output_file):
+                os.remove(output_file)
+
+    if GET_MOTION_VECTOR_AVERAGE:
+        mv_log = f"{TEMP_FOLDER}/{index}_motion_vectors.csv"
+        try:
+            command = f"{EXTRACT_MVS_RUNNABLE_PATH} {shlex.quote(video_path)} > {mv_log}"
+            subprocess.run(command, shell=True, check=True)
+            average_motion_vector_strength = get_average_motion_vector_strength(mv_log)
+        except Exception as e:
+            print(f"Error processing video {index}: {e}")
+        finally:
+            if os.path.exists(mv_log):
+                os.remove(mv_log)
+
+    return index, motion_score, vmaf_score, average_motion_vector_strength
+
+
+def process_videos_parallel(csv_file, num_processes=100):
+    """Score the videos listed in the ``path`` column of ``csv_file`` in parallel.
+
+    The scores are computed by a pool of ``num_processes`` workers and ``csv_file`` is
+    overwritten with the updated score columns.
+    """
+    df = pd.read_csv(csv_file)
+
+    if "motion_score" not in df.columns and GET_VMAF_MOTION_SCORE:
+        df["motion_score"] = 0.0
+        df["vmaf_score"] = 0.0
+    if "average_motion_vector_strength" not in df.columns and GET_MOTION_VECTOR_AVERAGE:
+        df["average_motion_vector_strength"] = 0.0
+
+    # Create temp folder if it doesn't exist
+    create_temp_folder()
+    try:
+        with multiprocessing.Pool(num_processes) as pool:
+            # apply each result as it arrives instead of buffering all of them in a list first
+            results = pool.imap(process_video, enumerate(df["path"]))
+            for index, motion_score, vmaf_score, strength in tqdm.tqdm(results, total=len(df)):
+                if GET_VMAF_MOTION_SCORE:
+                    df.at[index, "motion_score"] = motion_score
+                    df.at[index, "vmaf_score"] = vmaf_score
+                if GET_MOTION_VECTOR_AVERAGE:
+                    df.at[index, "average_motion_vector_strength"] = strength
+    finally:
+        # Remove the temp folder even when processing failed
+        if os.path.exists(TEMP_FOLDER):
+            shutil.rmtree(TEMP_FOLDER)
+
+    # Save the updated dataframe back to the same CSV file
+    df.to_csv(csv_file, index=False)
+
+
+if __name__ == "__main__":  # command-line entry point
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--csv_file", type=str, required=True)  # CSV with a `path` column; it is rewritten in place
+    args = parser.parse_args()
+    process_videos_parallel(args.csv_file)
diff --git a/tools/scoring/ocr/__init__.py b/tools/scoring/ocr/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/ocr/dbnetpp.py b/tools/scoring/ocr/dbnetpp.py
new file mode 100644
index 0000000..7c64615
--- /dev/null
+++ b/tools/scoring/ocr/dbnetpp.py
@@ -0,0 +1,65 @@
+# Detector configuration loaded by tools/scoring/ocr/inference.py through
+# ``Config.fromfile``. The ``type`` strings are resolved by the mmocr registry.
+model = dict(
+    type="DBNet",
+    backbone=dict(
+        type="CLIPResNet",
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type="BN", requires_grad=True),
+        norm_eval=False,
+        style="pytorch",
+        dcn=dict(type="DCNv2", deform_groups=1, fallback_on_stride=False),
+        # Backbone-only pretrained weights are disabled; the full-model
+        # checkpoint in ``init_cfg`` at the bottom of this dict is used instead.
+        # init_cfg=dict(
+        #     type='Pretrained',
+        #     checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
+        stage_with_dcn=(False, True, True, True),
+    ),
+    neck=dict(
+        type="FPNC",
+        # One entry per backbone output listed in ``out_indices``.
+        in_channels=[256, 512, 1024, 2048],
+        lateral_channels=256,
+        asf_cfg=dict(attention_type="ScaleChannelSpatial"),
+    ),
+    det_head=dict(
+        type="DBHead",
+        in_channels=256,
+        module_loss=dict(type="DBModuleLoss"),
+        postprocessor=dict(
+            type="DBPostprocessor",
+            text_repr_type="quad",
+            epsilon_ratio=0.002,
+        ),
+    ),
+    data_preprocessor=dict(
+        type="TextDetDataPreprocessor",
+        # Per-channel normalization constants on the 0-255 scale.
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        # The inference dataset hands over channel-reversed (BGR) arrays.
+        bgr_to_rgb=True,
+        pad_size_divisor=32,
+    ),
+    # Full-model weights: DBNet++ (ResNet50-oCLIP + FPNC), ICDAR2015, 1200 epochs
+    # (as encoded in the checkpoint file name).
+    init_cfg=dict(
+        type="Pretrained",
+        checkpoint="https://download.openmmlab.com/mmocr/textdet/dbnetpp/"
+        "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/"
+        "dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth",
+    ),
+)
+
+# NOTE(review): inference.py builds its own resize/crop transform and its own
+# PackTextDetInputs, so this pipeline looks unused at runtime — confirm before
+# relying on the values below.
+test_pipeline = [
+    # Image loading is left out here; the inference dataset decodes frames itself.
+    # dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+    dict(type="Resize", scale=(4068, 1024), keep_ratio=True),
+    dict(
+        type="PackTextDetInputs",
+        # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
+        meta_keys=("img_shape", "scale_factor"),
+    ),
+]
+
+# Visualization
+vis_backends = [dict(type="LocalVisBackend")]
+visualizer = dict(
+    type="TextDetLocalVisualizer",
+    name="visualizer",
+    vis_backends=vis_backends,
+)
diff --git a/tools/scoring/ocr/inference.py b/tools/scoring/ocr/inference.py
new file mode 100644
index 0000000..95a1737
--- /dev/null
+++ b/tools/scoring/ocr/inference.py
@@ -0,0 +1,158 @@
+import argparse
+import os
+
+import colossalai
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+from mmengine import Config
+from mmengine.dataset import Compose, default_collate
+from mmengine.registry import DefaultScope
+from mmocr.datasets import PackTextDetInputs
+from mmocr.registry import MODELS
+from torch.utils.data import DataLoader, DistributedSampler
+from torchvision.datasets.folder import pil_loader
+from torchvision.transforms import CenterCrop, Compose, Resize
+from tqdm import tqdm
+
+from tools.datasets.utils import extract_frames, is_video
+
+
+def merge_scores(gathered_list: list, meta: pd.DataFrame):
+    # reorder
+    indices_list = list(map(lambda x: x[0], gathered_list))
+    scores_list = list(map(lambda x: x[1], gathered_list))
+    flat_indices = []
+    for x in zip(*indices_list):
+        flat_indices.extend(x)
+    flat_scores = []
+    for x in zip(*scores_list):
+        flat_scores.extend(x)
+    flat_indices = np.array(flat_indices)
+    flat_scores = np.array(flat_scores)
+    # filter duplicates
+    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
+    meta.loc[unique_indices, "ocr"] = flat_scores[unique_indices_idx]
+
+
+class VideoTextDataset(torch.utils.data.Dataset):
+    def __init__(self, meta_path, transform):
+        self.meta_path = meta_path
+        self.meta = pd.read_csv(meta_path)
+        self.transform = transform
+        self.transform = Compose(
+            [
+                Resize(1024),
+                CenterCrop(1024),
+            ]
+        )
+        self.formatting = PackTextDetInputs(meta_keys=["scale_factor"])
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+        path = row["path"]
+
+        if is_video(path):
+            img = extract_frames(path, frame_inds=[10], backend="opencv")[0]
+        else:
+            img = pil_loader(path)
+
+        img = self.transform(img)
+        img_array = np.array(img)[:, :, ::-1].copy()  # bgr
+        results = {
+            "img": img_array,
+            "scale_factor": 1.0,
+            # 'img_shape': img_array.shape[-2],
+            # 'ori_shape': img_array.shape[-2],
+        }
+        results = self.formatting(results)
+        results["index"] = index
+
+        return results
+
+    def __len__(self):
+        return len(self.meta)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("--bs", type=int, default=16, help="Batch size")
+    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
+    parser.add_argument("--skip_if_existing", action="store_true")
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    wo_ext, ext = os.path.splitext(meta_path)
+    out_path = f"{wo_ext}_ocr{ext}"
+    if args.skip_if_existing and os.path.exists(out_path):
+        print(f"Output meta file '{out_path}' already exists. Exit.")
+        exit()
+
+    cfg = Config.fromfile("./tools/scoring/ocr/dbnetpp.py")
+    colossalai.launch_from_torch({})
+
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    DefaultScope.get_instance("ocr", scope_name="mmocr")  # use mmocr Registry as default
+
+    # build model
+    model = MODELS.build(cfg.model)
+    model.init_weights()
+    model.to(device)  # set data_preprocessor._device
+    print("==> Model built.")
+
+    # build dataset
+    transform = Compose(cfg.test_pipeline)
+    dataset = VideoTextDataset(meta_path=meta_path, transform=transform)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.bs,
+        num_workers=args.num_workers,
+        sampler=DistributedSampler(
+            dataset,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=False,
+            drop_last=False,
+        ),
+        collate_fn=default_collate,
+    )
+    print("==> Dataloader built.")
+
+    # compute scores
+    dataset.meta["ocr"] = np.nan
+    indices_list = []
+    scores_list = []
+    model.eval()
+    for data in tqdm(dataloader, disable=dist.get_rank() != 0):
+        indices_i = data["index"]
+        indices_list.extend(indices_i.tolist())
+        del data["index"]
+
+        pred = model.test_step(data)  # this line will cast data to device
+
+        num_texts_i = [(x.pred_instances.scores > 0.3).sum().item() for x in pred]
+        scores_list.extend(num_texts_i)
+
+    gathered_list = [None] * dist.get_world_size()
+    dist.all_gather_object(gathered_list, (indices_list, scores_list))
+
+    if dist.get_rank() == 0:
+        merge_scores(gathered_list, dataset.meta)
+        dataset.meta.to_csv(out_path, index=False)
+        print(f"New meta (shape={dataset.meta.shape}) with ocr results saved to '{out_path}'.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scoring/optical_flow/__init__.py b/tools/scoring/optical_flow/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/scoring/optical_flow/detect_zoom.py b/tools/scoring/optical_flow/detect_zoom.py
new file mode 100644
index 0000000..7dafb08
--- /dev/null
+++ b/tools/scoring/optical_flow/detect_zoom.py
@@ -0,0 +1,235 @@
+"""
+detect zoom in/zoom out videos
+"""
+
+import argparse
+
+# cv2.setNumThreads(1)
+from functools import partial
+
+import cv2
+import pandas as pd
+import scipy.stats as stats
+from pandarallel import pandarallel
+from tqdm import tqdm
+
+# thresholds
+max_frames = 5  # stop once this many frame pairs have been run through the correlation analysis
+p_threshold = 0.1  # a correlation whose p-value exceeds this rules out zoom for the frame pair
+tau_threshold = 0.50  # minimum |Kendall tau| for a correlation to count as zoom
+crop_ratio = 2 / 3  # fraction of height/width kept by the central crop that removes edge watermarks
+zoom_frame_count_threshold = 0.4  # fraction (not percent) of analyzed frame pairs that must look like zoom
+image_size = (256, 256)  # (height, width) frames are resized/cropped to before tracking
+min_corr_counts = 10  # a correlation is only computed with more than this many points
+# Frame step between consecutive samples. NOTE(review): the original note says
+# this "need to be larger than 10 to be accurate", yet the value is exactly 10 — confirm.
+sample_interval = 10
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+    parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
+    parser.add_argument("--num_samples", type=int, default=-1, help="number of samples to process, for quick tests")
+
+    args = parser.parse_args()
+    return args
+
+
+def crop_resize(image, image_size):
+    # crop the edge to remove watermarks
+    h, w = image.shape
+    crop_h, crop_w = int(h * crop_ratio), int(w * crop_ratio)
+    start_h = (h - crop_h) // 2
+    start_w = (w - crop_w) // 2
+    image = image[start_h : start_h + crop_h, start_w : start_w + crop_w]
+
+    # resize
+    h, w = image.shape
+    h_t, w_t = image_size
+    scale = max(h_t / h, w_t / w)
+    resized_image = cv2.resize(image, (int(w * scale), int(h * scale)))  # NOTE: strage but cv2 need to put w h
+
+    # center crop
+    h, w = resized_image.shape[:2]
+    # Calculate the center crop dimensions
+    crop_h, crop_w = image_size
+    start_h = (h - crop_h) // 2
+    start_w = (w - crop_w) // 2
+    center_cropped_image = resized_image[start_h : start_h + crop_h, start_w : start_w + crop_w]
+
+    return center_cropped_image
+
+
+def is_dynamic_video(
+    video_path,
+):
+    cap = cv2.VideoCapture(video_path)
+
+    # Check if the video opened successfully
+    if not cap.isOpened():
+        print(f"Error: Could not open video '{video_path}'")
+        return False
+
+    frame_index = -1
+
+    p0 = None
+    while p0 is None:
+        frame_index += 1
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+        success, frame = cap.read()
+        if not success:
+            print(f"Warning: Failed to read frame {frame_index} of video '{video_path}'")
+            break
+
+        prev_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        prev_gray = crop_resize(prev_gray, image_size)
+
+        # Feature parameters
+        feature_params = dict(maxCorners=100, qualityLevel=0.3, minDistance=7, blockSize=7)
+        p0 = cv2.goodFeaturesToTrack(prev_gray, mask=None, **feature_params)
+    if p0 is None:
+        return True
+
+    # Calculate flow
+    # Parameters for optical flow
+    lk_params = dict(winSize=(15, 15), maxLevel=2, criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
+
+    zoom_count = 0
+    analyzed_count = 0
+    invalid_count = 0
+
+    while analyzed_count < max_frames:
+        if invalid_count >= 3:
+            break
+
+        frame_index += sample_interval
+
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
+        success, frame = cap.read()
+        if not success:
+            break
+
+        # Convert to grayscale
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        gray = crop_resize(gray, image_size)
+
+        # Calculate optical flow
+        p1, status, err = cv2.calcOpticalFlowPyrLK(prev_gray, gray, p0, None, **lk_params)
+
+        if p1 is None:
+            invalid_count += 1
+            continue
+
+        # Select good points
+        good_new = p1[status == 1]  # status == 1: found flow; status == 0: didn't
+        good_old = p0[status == 1]
+
+        # Calculate motion vectors
+        motion_vectors = good_new - good_old
+
+        motion_x_neg = []
+        x_neg = []
+        motion_x_pos = []
+        x_pos = []
+        motion_y_neg = []
+        y_neg = []
+        motion_y_pos = []
+        y_pos = []
+
+        for i, (motion_x, motion_y) in enumerate(motion_vectors):
+            if motion_x < 0:
+                motion_x_neg.append(motion_x)
+                x_neg.append(good_old[i][0])
+            else:
+                motion_x_pos.append(motion_x)
+                x_pos.append(good_old[i][0])
+
+            if motion_y < 0:
+                motion_y_neg.append(motion_y)
+                y_neg.append(good_old[i][1])
+            else:
+                motion_y_pos.append(motion_y)
+                y_pos.append(good_old[i][1])
+
+        is_zoom = True
+        corr_checks = 0
+
+        if len(motion_x_neg) > min_corr_counts:
+            corr_checks += 1
+            tau_x_neg, p_x_neg = stats.kendalltau(motion_x_neg, x_neg)
+            if p_x_neg > p_threshold or abs(tau_x_neg) < tau_threshold:
+                is_zoom = False
+        if len(motion_x_pos) > min_corr_counts:
+            corr_checks += 1
+            tau_x_pos, p_x_pos = stats.kendalltau(motion_x_pos, x_pos)
+            if p_x_pos > p_threshold or abs(tau_x_pos) < tau_threshold:
+                is_zoom = False
+        if len(motion_y_neg) > min_corr_counts:
+            corr_checks += 1
+            tau_y_neg, p_y_neg = stats.kendalltau(motion_y_neg, y_neg)
+            if p_y_neg > p_threshold or abs(tau_y_neg) < tau_threshold:
+                is_zoom = False
+        if len(motion_y_pos) > min_corr_counts:
+            corr_checks += 1
+            tau_y_pos, p_y_pos = stats.kendalltau(motion_y_pos, y_pos)
+            if p_y_pos > p_threshold or abs(tau_y_pos) < tau_threshold:
+                is_zoom = False
+
+        if corr_checks > 0:
+            if is_zoom:
+                zoom_count += 1
+            analyzed_count += 1
+        else:
+            invalid_count += 1
+
+        # Update previous frame and points
+        prev_gray = gray.copy()
+        p0 = good_new.reshape(-1, 1, 2)
+
+    cap.release()
+
+    if analyzed_count > 2 and zoom_count / analyzed_count >= zoom_frame_count_threshold:
+        return False
+    else:
+        return True
+
+
+def process_single_row(row, args):
+    path = row["path"]
+    return is_dynamic_video(path)
+
+
+def main():
+    args = parse_args()
+    meta_path = args.meta_path
+
+    tqdm.pandas()
+    if not args.disable_parallel:
+        if args.num_workers is not None:
+            pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
+        else:
+            pandarallel.initialize(progress_bar=True)
+    process_single_row_partial = partial(process_single_row, args=args)
+
+    meta = pd.read_csv(meta_path)
+    if args.num_samples > -1:
+        meta = meta[: args.num_samples]
+
+    if not args.disable_parallel:
+        ret = meta.parallel_apply(process_single_row_partial, axis=1)
+    else:
+        ret = meta.progress_apply(process_single_row_partial, axis=1)
+    meta0 = meta[ret]
+    meta1 = meta[~ret]
+
+    out_path = meta_path.replace(".csv", "_filter-zoom.csv")
+    meta0.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta0.shape}) saved to '{out_path}'")
+
+    out_path = meta_path.replace(".csv", "_zoom.csv")
+    meta1.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta1.shape}) saved to '{out_path}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scoring/optical_flow/detect_zoom_av.py b/tools/scoring/optical_flow/detect_zoom_av.py
new file mode 100644
index 0000000..4edc445
--- /dev/null
+++ b/tools/scoring/optical_flow/detect_zoom_av.py
@@ -0,0 +1,246 @@
+"""
+detect zoom in/zoom out videos
+This file is deprecated: it opens videos with the av backend, which returns inaccurate frames.
+"""
+
+import argparse
+
+# cv2.setNumThreads(1)
+from functools import partial
+
+import cv2
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
+from pandarallel import pandarallel
+from tqdm import tqdm
+
+from tools.datasets.utils import extract_frames
+
+# hyper-parameters
+max_frames = 5  # number of frames sampled after the first tracked frame for correlation analysis
+p_threshold = 0.1  # a correlation whose p-value exceeds this rules out zoom for the frame pair
+tau_threshold = 0.60  # minimum |Kendall tau| for a correlation to count as zoom
+crop_ratio = 2 / 3  # fraction of height/width kept by the central crop that removes edge watermarks
+zoom_frame_count_threshold = 0.4  # fraction (not percent) of analyzed frame pairs that must look like zoom
+image_size = (256, 256)  # (height, width) frames are resized/cropped to before tracking
+min_corr_counts = 10  # a correlation is only computed with more than this many points
+# Frame step between consecutive samples. NOTE(review): the original note says
+# this "need to be larger than 10 to be accurate", yet the value is exactly 10 — confirm.
+frame_interval = 10
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
+    parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
+    parser.add_argument("--num_samples", type=int, default=-1, help="number of samples to process, for quick tests")
+
+    args = parser.parse_args()
+    return args
+
+
+def crop_resize(image, image_size):
+    # crop the edge to remove watermarks
+    h, w = image.shape
+    crop_h, crop_w = int(h * crop_ratio), int(w * crop_ratio)
+    start_h = (h - crop_h) // 2
+    start_w = (w - crop_w) // 2
+    image = image[start_h : start_h + crop_h, start_w : start_w + crop_w]
+
+    # resize
+    h, w = image.shape
+    h_t, w_t = image_size
+    scale = max(h_t / h, w_t / w)
+    resized_image = cv2.resize(image, (int(w * scale), int(h * scale)))  # NOTE: strage but cv2 need to put w h
+
+    # center crop
+    h, w = resized_image.shape[:2]
+    # Calculate the center crop dimensions
+    crop_h, crop_w = image_size
+    start_h = (h - crop_h) // 2
+    start_w = (w - crop_w) // 2
+    center_cropped_image = resized_image[start_h : start_h + crop_h, start_w : start_w + crop_w]
+
+    return center_cropped_image
+
+
+def is_not_zoom_video(
+    video_path,
+):
+    interval_first = 5
+    try:
+        first_frames = extract_frames(
+            video_path,
+            frame_inds=list(range(0, 5 * interval_first + 1, interval_first)),
+            backend="av",
+        )
+    except:
+        return False
+
+    # fine feature to track
+    p0 = None
+    feature_params = dict(maxCorners=100, qualityLevel=0.3, minDistance=7, blockSize=7)
+    for idx, frame in enumerate(first_frames):
+        frame_np = np.array(frame)
+
+        prev_gray = cv2.cvtColor(frame_np, cv2.COLOR_RGB2GRAY)
+        prev_gray = crop_resize(prev_gray, image_size)
+
+        # Feature parameters
+        p0 = cv2.goodFeaturesToTrack(prev_gray, mask=None, **feature_params)  # [N, 1, 2]
+        if p0 is not None:
+            break
+    if p0 is None:
+        return False
+
+    # calculate flow at a regular interval
+    zoom_count = 0
+    analyzed_count = 0
+    invalid_counts = 0
+    # Parameters for optical flow
+    lk_params = dict(winSize=(15, 15), maxLevel=2, criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
+
+    frame_inds = list(
+        range(
+            idx * interval_first + frame_interval,
+            idx * interval_first + max_frames * frame_interval + 1,
+            frame_interval,
+        )
+    )
+    frames = extract_frames(
+        video_path,
+        frame_inds=frame_inds,
+        backend="av",
+    )  # TODO: non-precise frames
+    for idx, frame in enumerate(frames):
+        # Convert to grayscale
+        frame_np = np.array(frame)
+        gray = cv2.cvtColor(frame_np, cv2.COLOR_RGB2GRAY)
+        gray = crop_resize(gray, image_size)
+
+        # Calculate optical flow
+        p1, st, err = cv2.calcOpticalFlowPyrLK(prev_gray, gray, p0, None, **lk_params)
+
+        if invalid_counts >= 3:
+            break
+        if p1 is None:
+            invalid_counts += 1
+            continue
+
+        # Select good points
+        good_new = p1[st == 1]
+        good_old = p0[st == 1]
+
+        # Calculate motion vectors
+        motion_vectors = good_new - good_old
+
+        motion_x_neg = []
+        x_neg = []
+        motion_x_pos = []
+        x_pos = []
+        motion_y_neg = []
+        y_neg = []
+        motion_y_pos = []
+        y_pos = []
+
+        for i, (motion_x, motion_y) in enumerate(motion_vectors):
+            if motion_x < 0:
+                motion_x_neg.append(motion_x)
+                x_neg.append(good_old[i][0])
+            else:
+                motion_x_pos.append(motion_x)
+                x_pos.append(good_old[i][0])
+
+            if motion_y < 0:
+                motion_y_neg.append(motion_y)
+                y_neg.append(good_old[i][1])
+            else:
+                motion_y_pos.append(motion_y)
+                y_pos.append(good_old[i][1])
+
+        is_zoom = True
+        corr_checks = 0
+
+        if len(motion_x_neg) > min_corr_counts:
+            corr_checks += 1
+            tau_x_neg, p_x_neg = stats.kendalltau(motion_x_neg, x_neg)
+            if p_x_neg > p_threshold or abs(tau_x_neg) < tau_threshold:
+                is_zoom = False
+        if len(motion_x_pos) > min_corr_counts:
+            corr_checks += 1
+            tau_x_pos, p_x_pos = stats.kendalltau(motion_x_pos, x_pos)
+            if p_x_pos > p_threshold or abs(tau_x_pos) < tau_threshold:
+                is_zoom = False
+        if len(motion_y_neg) > min_corr_counts:
+            corr_checks += 1
+            tau_y_neg, p_y_neg = stats.kendalltau(motion_y_neg, y_neg)
+            if p_y_neg > p_threshold or abs(tau_y_neg) < tau_threshold:
+                is_zoom = False
+        if len(motion_y_pos) > min_corr_counts:
+            corr_checks += 1
+            tau_y_pos, p_y_pos = stats.kendalltau(motion_y_pos, y_pos)
+            if p_y_pos > p_threshold or abs(tau_y_pos) < tau_threshold:
+                is_zoom = False
+
+        if corr_checks > 0:
+            if is_zoom:
+                zoom_count += 1
+            analyzed_count += 1
+        else:
+            invalid_counts += 1
+
+        # Update previous frame and points
+        prev_gray = gray.copy()
+        p0 = good_new.reshape(-1, 1, 2)
+
+    if analyzed_count > 2 and zoom_count / analyzed_count >= zoom_frame_count_threshold:
+        return False
+    else:
+        return True
+
+
+def process_single_row(row, args):
+    path = row["path"]
+    return is_not_zoom_video(path)
+
+
+def main():
+    args = parse_args()
+    meta_path = args.meta_path
+
+    # # time test
+    # start_time = time.time()
+
+    tqdm.pandas()
+    if not args.disable_parallel:
+        if args.num_workers is not None:
+            pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
+        else:
+            pandarallel.initialize(progress_bar=True)
+    process_single_row_partial = partial(process_single_row, args=args)
+
+    meta = pd.read_csv(meta_path)
+    if args.num_samples > -1:
+        meta = meta[: args.num_samples]
+
+    if not args.disable_parallel:
+        ret = meta.parallel_apply(process_single_row_partial, axis=1)
+    else:
+        ret = meta.progress_apply(process_single_row_partial, axis=1)
+    meta0 = meta[ret]
+    meta1 = meta[~ret]
+
+    out_path = meta_path.replace(".csv", "_filter-zoom.csv")
+    meta0.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta0.shape}) saved to '{out_path}'")
+
+    out_path = meta_path.replace(".csv", "_zoom.csv")
+    meta1.to_csv(out_path, index=False)
+    print(f"New meta (shape={meta1.shape}) saved to '{out_path}'")
+
+    # # time test
+    # print("execution time:", time.time() - start_time)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scoring/optical_flow/inference.py b/tools/scoring/optical_flow/inference.py
new file mode 100644
index 0000000..057e1f3
--- /dev/null
+++ b/tools/scoring/optical_flow/inference.py
@@ -0,0 +1,246 @@
+import cv2  # isort:skip
+
+import argparse
+import gc
+import os
+from datetime import timedelta
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from einops import rearrange
+from PIL import Image
+from torch.utils.data import DataLoader, DistributedSampler
+from torchvision.transforms.functional import pil_to_tensor
+from tqdm import tqdm
+
+# from tools.datasets.utils import extract_frames
+from tools.scoring.optical_flow.unimatch import UniMatch
+
+# torch.backends.cudnn.enabled = False # This line enables large batch, but the speed is similar
+
+
+def extract_frames(
+    video_path,
+    frame_inds=None,
+    points=None,
+    backend="opencv",
+    return_length=False,
+    num_frames=None,
+):
+    """
+    Args:
+        video_path (str): path to video
+        frame_inds (List[int]): indices of frames to extract
+        points (List[float]): values within [0, 1); multiply #frames to get frame indices
+    Return:
+        List[PIL.Image]
+    """
+    assert backend in ["av", "opencv", "decord"]
+    assert (frame_inds is None) or (points is None)
+    assert backend == "opencv"
+
+    cap = cv2.VideoCapture(video_path)
+    if num_frames is not None:
+        total_frames = num_frames
+    else:
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    if points is not None:
+        frame_inds = [int(p * total_frames) for p in points]
+
+    frames = []
+    for idx in frame_inds:
+        if idx >= total_frames:
+            idx = total_frames - 1
+
+        success = cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        if not success:
+            break
+
+        try:
+            ret, frame = cap.read()
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame = Image.fromarray(frame)
+            frames.append(frame)
+        except Exception:
+            continue
+
+    if return_length:
+        return frames, total_frames
+    return frames
+
+
+def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
+    # reorder
+    indices_list = list(map(lambda x: x[0], gathered_list))
+    scores_list = list(map(lambda x: x[1], gathered_list))
+
+    flat_indices = []
+    for x in zip(*indices_list):
+        flat_indices.extend(x)
+    flat_scores = []
+    for x in zip(*scores_list):
+        flat_scores.extend(x)
+    flat_indices = np.array(flat_indices)
+    flat_scores = np.array(flat_scores)
+
+    # filter duplicates
+    unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
+    meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
+
+    # drop indices in meta not in unique_indices
+    meta = meta.loc[unique_indices]
+    return meta
+
+
+class VideoTextDataset(torch.utils.data.Dataset):
+    def __init__(self, meta_path, frame_inds=None):
+        self.meta_path = meta_path
+        self.meta = pd.read_csv(meta_path)
+        self.frame_inds = frame_inds
+
+    def __getitem__(self, index):
+        sample = self.meta.iloc[index]
+        path = sample["path"]
+
+        # extract frames
+        images = extract_frames(path, frame_inds=self.frame_inds, backend="opencv")
+
+        # transform
+        images = torch.stack([pil_to_tensor(x) for x in images])
+
+        # stack
+        # shape: [N, C, H, W]; dtype: torch.uint8
+        images = images.float()
+        H, W = images.shape[-2:]
+        if H > W:
+            images = rearrange(images, "N C H W -> N C W H")
+        images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
+
+        ret = dict(index=index, images=images)
+        return ret
+
+    def __len__(self):
+        return len(self.meta)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("--bs", type=int, default=1, help="Batch size")  # don't use too large bs for unimatch
+    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
+    parser.add_argument("--skip_if_existing", action="store_true")
+    args = parser.parse_args()
+    return args
+
+
+@torch.no_grad()
+def main():
+    args = parse_args()
+
+    meta_path = args.meta_path
+    if not os.path.exists(meta_path):
+        print(f"Meta file '{meta_path}' not found. Exit.")
+        exit()
+
+    wo_ext, ext = os.path.splitext(meta_path)
+    out_path = f"{wo_ext}_flow{ext}"
+    if args.skip_if_existing and os.path.exists(out_path):
+        print(f"Output meta file '{out_path}' already exists. Exit.")
+        exit()
+
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
+    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+
+    # build model
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = UniMatch(
+        feature_channels=128,
+        num_scales=2,
+        upsample_factor=4,
+        num_head=1,
+        ffn_dim_expansion=4,
+        num_transformer_layers=6,
+        reg_refine=True,
+        task="flow",
+    )
+    ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
+    model.load_state_dict(ckpt["model"])
+    model = model.to(device)
+
+    # build dataset
+    NUM_FRAMES = 10
+    frames_inds = [15 * i for i in range(0, NUM_FRAMES)]
+    dataset = VideoTextDataset(meta_path=meta_path, frame_inds=frames_inds)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.bs,
+        num_workers=args.num_workers,
+        sampler=DistributedSampler(
+            dataset,
+            num_replicas=dist.get_world_size(),
+            rank=dist.get_rank(),
+            shuffle=False,
+            drop_last=False,
+        ),
+    )
+
+    # compute optical flow scores
+    indices_list = []
+    scores_list = []
+    model.eval()
+    for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
+        indices = batch["index"]
+        images = batch["images"].to(device)
+
+        B = images.shape[0]
+        batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
+        batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
+
+        res = model(
+            batch_0,
+            batch_1,
+            attn_type="swin",
+            attn_splits_list=[2, 8],
+            corr_radius_list=[-1, 4],
+            prop_radius_list=[-1, 1],
+            num_reg_refine=6,
+            task="flow",
+            pred_bidir_flow=False,
+        )
+        flow_maps = res["flow_preds"][-1]  # [B * (N-1), 2, H, W]
+        flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
+        flow_scores = flow_maps.norm(dim=-1).mean(dim=[1, 2, 3]).cpu()
+
+        indices_list.extend(indices.tolist())
+        scores_list.extend(flow_scores.tolist())
+
+    # save local results
+    meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="flow")
+    save_dir_local = os.path.join(os.path.dirname(out_path), "parts")
+    os.makedirs(save_dir_local, exist_ok=True)
+    out_path_local = os.path.join(
+        save_dir_local, os.path.basename(out_path).replace(".csv", f"_part_{dist.get_rank()}.csv")
+    )
+    meta_local.to_csv(out_path_local, index=False)
+
+    # wait for all ranks to finish data processing
+    dist.barrier()
+
+    torch.cuda.empty_cache()
+    gc.collect()
+    gathered_list = [None] * dist.get_world_size()
+    dist.all_gather_object(gathered_list, (indices_list, scores_list))
+    if dist.get_rank() == 0:
+        meta_new = merge_scores(gathered_list, dataset.meta, column="flow")
+        meta_new.to_csv(out_path, index=False)
+        print(f"New meta with optical flow scores saved to '{out_path}'.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/scoring/optical_flow/unimatch/__init__.py b/tools/scoring/optical_flow/unimatch/__init__.py
new file mode 100644
index 0000000..c1f4eb2
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/__init__.py
@@ -0,0 +1 @@
+from .unimatch import UniMatch
diff --git a/tools/scoring/optical_flow/unimatch/attention.py b/tools/scoring/optical_flow/unimatch/attention.py
new file mode 100644
index 0000000..23fb904
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/attention.py
@@ -0,0 +1,280 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .utils import merge_splits, merge_splits_1d, split_feature, split_feature_1d
+
+
+def single_head_full_attention(q, k, v):
+    """Scaled dot-product attention with one head; q, k, v are 3-D [B, L, C] tensors."""
+    assert q.dim() == k.dim() == v.dim() == 3
+    channels = q.size(2)
+    # Similarity of every query with every key, scaled by sqrt(C).
+    logits = torch.matmul(q, k.permute(0, 2, 1)) / (channels**0.5)  # [B, L, L]
+    weights = torch.softmax(logits, dim=2)  # normalised over the key axis
+    # Weighted sum of the values.
+    return torch.matmul(weights, v)  # [B, L, C]
+
+
+def single_head_full_attention_1d(
+    q,
+    k,
+    v,
+    h=None,
+    w=None,
+):
+    """Single-head attention restricted to each image row (along the width axis).
+
+    q, k, v are [B, L, C] with L == h * w; both ``h`` and ``w`` must be given.
+    Every position attends only to positions on the same row, and the result
+    is returned flattened as [B, H*W, C].
+    """
+    assert h is not None and w is not None
+    assert q.size(1) == h * w
+
+    batch, _, channels = q.size()
+
+    # Unflatten the token axis into (row, column).
+    q_rows = q.view(batch, h, w, channels)  # [B, H, W, C]
+    k_rows = k.view(batch, h, w, channels)
+    v_rows = v.view(batch, h, w, channels)
+
+    # Row-wise similarities, scaled by sqrt(C).
+    logits = torch.matmul(q_rows, k_rows.permute(0, 1, 3, 2)) / channels**0.5  # [B, H, W, W]
+    weights = torch.softmax(logits, dim=-1)
+    return torch.matmul(weights, v_rows).view(batch, -1, channels)  # [B, H*W, C]
+
+
+def single_head_split_window_attention(
+    q,
+    k,
+    v,
+    num_splits=1,
+    with_shift=False,
+    h=None,
+    w=None,
+    attn_mask=None,
+):
+    """Single-head attention computed inside non-overlapping 2-D windows.
+
+    The [h, w] token grid is cut into ``num_splits`` x ``num_splits`` windows
+    and attention is evaluated independently per window.  With
+    ``with_shift`` the grid is first rolled by half a window and
+    ``attn_mask`` (required in that case) is added to the logits; the roll is
+    undone on the output.
+    ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+
+    q, k, v: [B, L, C] with L == h * w.  Returns a [B, H*W, C] tensor.
+    """
+    assert q.dim() == k.dim() == v.dim() == 3
+
+    assert h is not None and w is not None
+    assert q.size(1) == h * w
+
+    batch, _, channels = q.size()
+    num_windows = batch * num_splits * num_splits
+    win_h, win_w = h // num_splits, w // num_splits
+
+    # Restore the spatial layout: [B, H, W, C].
+    q, k, v = (t.view(batch, h, w, channels) for t in (q, k, v))
+
+    if with_shift:
+        assert attn_mask is not None  # compute once
+        shift = (win_h // 2, win_w // 2)
+        # Cyclically shift so the windows straddle the previous borders.
+        neg_shift = (-shift[0], -shift[1])
+        q = torch.roll(q, shifts=neg_shift, dims=(1, 2))
+        k = torch.roll(k, shifts=neg_shift, dims=(1, 2))
+        v = torch.roll(v, shifts=neg_shift, dims=(1, 2))
+
+    # Cut into windows and flatten each one into a token sequence.
+    q_win = split_feature(q, num_splits=num_splits, channel_last=True).view(num_windows, -1, channels)
+    k_win = split_feature(k, num_splits=num_splits, channel_last=True).view(num_windows, -1, channels)
+    v_win = split_feature(v, num_splits=num_splits, channel_last=True).view(num_windows, -1, channels)
+
+    # [B*K*K, H/K*W/K, H/K*W/K]
+    scores = torch.matmul(q_win, k_win.permute(0, 2, 1)) / channels**0.5
+    if with_shift:
+        scores += attn_mask.repeat(batch, 1, 1)
+    attn = torch.softmax(scores, dim=-1)
+
+    out = torch.matmul(attn, v_win)  # [B*K*K, H/K*W/K, C]
+
+    # Stitch the windows back together: [B, H, W, C].
+    out = merge_splits(
+        out.view(num_windows, win_h, win_w, channels), num_splits=num_splits, channel_last=True
+    )
+
+    if with_shift:
+        # shift back
+        out = torch.roll(out, shifts=shift, dims=(1, 2))
+
+    return out.view(batch, -1, channels)
+
+
+def single_head_split_window_attention_1d(
+    q,
+    k,
+    v,
+    relative_position_bias=None,
+    num_splits=1,
+    with_shift=False,
+    h=None,
+    w=None,
+    attn_mask=None,
+):
+    """Single-head attention inside 1-D windows laid along the width axis.
+
+    Each of the ``h`` rows is cut into ``num_splits`` windows and attention is
+    computed independently per window.  With ``with_shift`` every row is first
+    rolled by half a window and ``attn_mask`` (required in that case) is added
+    to the logits; the roll is undone on the output.
+
+    q, k, v: [B, L, C] with L == h * w.  ``relative_position_bias`` is accepted
+    but never read by this function.  Returns a [B, H*W, C] tensor.
+    """
+    assert h is not None and w is not None
+    assert q.size(1) == h * w
+
+    batch, _, channels = q.size()
+    num_rows = batch * h
+    num_windows = batch * num_splits * h
+    win_w = w // num_splits
+
+    # Treat every image row as an independent sequence: [B*H, W, C].
+    q, k, v = (t.view(num_rows, w, channels) for t in (q, k, v))
+
+    if with_shift:
+        assert attn_mask is not None  # compute once
+        shift = win_w // 2
+        # Cyclic shift along the width so windows straddle the old borders.
+        q = torch.roll(q, shifts=-shift, dims=1)
+        k = torch.roll(k, shifts=-shift, dims=1)
+        v = torch.roll(v, shifts=-shift, dims=1)
+
+    # Cut the rows into windows: [B*H*K, W/K, C].
+    q_win = split_feature_1d(q, num_splits=num_splits).view(num_windows, -1, channels)
+    k_win = split_feature_1d(k, num_splits=num_splits).view(num_windows, -1, channels)
+    v_win = split_feature_1d(v, num_splits=num_splits).view(num_windows, -1, channels)
+
+    # [B*H*K, W/K, W/K]
+    scores = torch.matmul(q_win, k_win.permute(0, 2, 1)) / channels**0.5
+    if with_shift:
+        # attn_mask: [K, W/K, W/K], tiled over every row of every sample
+        scores += attn_mask.repeat(num_rows, 1, 1)
+
+    attn = torch.softmax(scores, dim=-1)
+    out = torch.matmul(attn, v_win)  # [B*H*K, W/K, C]
+
+    out = merge_splits_1d(out, h, num_splits=num_splits)  # [B, H, W, C]
+
+    if with_shift:
+        # shift back
+        out = torch.roll(out, shifts=shift, dims=2)
+
+    return out.view(batch, -1, channels)
+
+
+class SelfAttnPropagation(nn.Module):
+    """
+    Flow propagation with self-attention on features: the attention weights are
+    computed from feature0 (query and key) and the propagated values are the flow.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        **kwargs,
+    ):
+        super(SelfAttnPropagation, self).__init__()
+        # Square linear projections for query and key; **kwargs is accepted but not read.
+        self.q_proj = nn.Linear(in_channels, in_channels)
+        self.k_proj = nn.Linear(in_channels, in_channels)
+        # Xavier-initialise weight matrices only; 1-D parameters (biases) keep their default init.
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def forward(
+        self,
+        feature0,
+        flow,
+        local_window_attn=False,
+        local_window_radius=1,
+        **kwargs,
+    ):
+        # q, k: feature [B, C, H, W], v: flow [B, 2, H, W]; returns the propagated flow, same shape as `flow`
+        if local_window_attn:
+            return self.forward_local_window_attn(feature0, flow, local_window_radius=local_window_radius)
+
+        b, c, h, w = feature0.size()
+        # Flatten the spatial grid into a token axis.
+        query = feature0.view(b, c, h * w).permute(0, 2, 1)  # [B, H*W, C]
+
+        # NOTE: the key is computed from the *already projected* query (k_proj(q_proj(x)))
+        # rather than from the raw features (k_proj(x)).  The upstream author noticed
+        # this while cleaning up the code and kept it on purpose: both projections are
+        # linear, so the two key matrices can be merged and accuracy is unaffected,
+        # while changing it would require re-training all released models.
+        # Do not "fix" this without re-training.
+        query = self.q_proj(query)  # [B, H*W, C]
+        key = self.k_proj(query)  # [B, H*W, C]
+
+        value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1)  # [B, H*W, 2]
+        # Global attention: every pixel attends to every other pixel.
+        scores = torch.matmul(query, key.permute(0, 2, 1)) / (c**0.5)  # [B, H*W, H*W]
+        prob = torch.softmax(scores, dim=-1)
+
+        out = torch.matmul(prob, value)  # [B, H*W, 2]
+        out = out.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2)  # [B, 2, H, W]
+
+        return out
+
+    def forward_local_window_attn(
+        self,
+        feature0,
+        flow,
+        local_window_radius=1,
+    ):
+        assert flow.size(1) == 2 or flow.size(1) == 1  # flow or disparity or depth
+        assert local_window_radius > 0
+        # Local variant: each pixel attends only to its (2R+1) x (2R+1) neighbourhood.
+        b, c, h, w = feature0.size()
+
+        value_channel = flow.size(1)
+        # Queries: one projected feature vector per pixel.
+        feature0_reshape = self.q_proj(feature0.view(b, c, -1).permute(0, 2, 1)).reshape(
+            b * h * w, 1, c
+        )  # [B*H*W, 1, C]
+
+        kernel_size = 2 * local_window_radius + 1
+        # Keys: projected from the raw features (unlike forward()), restored to [B, C, H, W] for unfolding.
+        feature0_proj = self.k_proj(feature0.view(b, c, -1).permute(0, 2, 1)).permute(0, 2, 1).reshape(b, c, h, w)
+
+        feature0_window = F.unfold(
+            feature0_proj, kernel_size=kernel_size, padding=local_window_radius
+        )  # [B, C*(2R+1)^2), H*W]
+
+        feature0_window = (
+            feature0_window.view(b, c, kernel_size**2, h, w)
+            .permute(0, 3, 4, 1, 2)
+            .reshape(b * h * w, c, kernel_size**2)
+        )  # [B*H*W, C, (2R+1)^2]
+        # Values: the flow inside the same neighbourhood.
+        flow_window = F.unfold(flow, kernel_size=kernel_size, padding=local_window_radius)  # [B, 2*(2R+1)^2), H*W]
+
+        flow_window = (
+            flow_window.view(b, value_channel, kernel_size**2, h, w)
+            .permute(0, 3, 4, 2, 1)
+            .reshape(b * h * w, kernel_size**2, value_channel)
+        )  # [B*H*W, (2R+1)^2, 2]
+        # NOTE: out-of-image neighbours come from F.unfold's zero padding: they score 0 and carry zero flow, unmasked.
+        scores = torch.matmul(feature0_reshape, feature0_window) / (c**0.5)  # [B*H*W, 1, (2R+1)^2]
+
+        prob = torch.softmax(scores, dim=-1)
+
+        out = (
+            torch.matmul(prob, flow_window).view(b, h, w, value_channel).permute(0, 3, 1, 2).contiguous()
+        )  # [B, 2, H, W]
+
+        return out
diff --git a/tools/scoring/optical_flow/unimatch/backbone.py b/tools/scoring/optical_flow/unimatch/backbone.py
new file mode 100644
index 0000000..5c2cc19
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/backbone.py
@@ -0,0 +1,128 @@
+import torch.nn as nn
+
+from .trident_conv import MultiScaleTridentConv
+
+
+class ResidualBlock(nn.Module):
+    """Two 3x3 convolutions with a residual connection.
+
+    The skip path is the identity unless the block changes the resolution
+    (``stride != 1``) or the channel count, in which case it is a 1x1
+    convolution followed by its own normalisation layer.
+    """
+
+    def __init__(self, in_planes, planes, norm_layer=nn.InstanceNorm2d, stride=1, dilation=1):
+        super().__init__()
+
+        needs_projection = stride != 1 or in_planes != planes
+        # padding == dilation keeps the spatial size of a 3x3 convolution at stride 1
+        conv_kwargs = dict(kernel_size=3, dilation=dilation, padding=dilation, bias=False)
+
+        self.conv1 = nn.Conv2d(in_planes, planes, stride=stride, **conv_kwargs)
+        self.conv2 = nn.Conv2d(planes, planes, **conv_kwargs)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.norm1 = norm_layer(planes)
+        self.norm2 = norm_layer(planes)
+        # The projection shortcut and its norm exist only when the shapes differ.
+        if needs_projection:
+            self.norm3 = norm_layer(planes)
+            shortcut = nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride)
+            self.downsample = nn.Sequential(shortcut, self.norm3)
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        """Return ``relu(shortcut(x) + residual(x))``."""
+        residual = self.relu(self.norm1(self.conv1(x)))
+        residual = self.relu(self.norm2(self.conv2(residual)))
+        # Identity skip unless a projection was built in __init__.
+        shortcut = x if self.downsample is None else self.downsample(x)
+        return self.relu(shortcut + residual)
+
+
+class CNNEncoder(nn.Module):
+    """Convolutional feature extractor built from residual blocks.
+
+    With ``num_output_scales == 1`` ``forward`` returns a one-element list with
+    the 1/8-resolution feature map.  With 2, 3 or 4 scales the backbone stops at
+    1/4 resolution and a ``MultiScaleTridentConv`` yields one map per scale.
+    """
+
+    def __init__(
+        self,
+        output_dim=128,
+        norm_layer=nn.InstanceNorm2d,
+        num_output_scales=1,
+        **kwargs,
+    ):
+        """Build the layers; raises ValueError when ``num_output_scales`` exceeds 4."""
+        super().__init__()
+        self.num_branch = num_output_scales
+
+        feature_dims = [64, 96, 128]
+
+        self.conv1 = nn.Conv2d(3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False)  # 1/2
+        self.norm1 = norm_layer(feature_dims[0])
+        self.relu1 = nn.ReLU(inplace=True)
+
+        # _make_layer reads and updates self.in_planes as the layers are chained.
+        self.in_planes = feature_dims[0]
+        self.layer1 = self._make_layer(feature_dims[0], stride=1, norm_layer=norm_layer)  # 1/2
+        self.layer2 = self._make_layer(feature_dims[1], stride=2, norm_layer=norm_layer)  # 1/4
+
+        # highest resolution 1/4 or 1/8
+        stride = 2 if num_output_scales == 1 else 1
+        self.layer3 = self._make_layer(
+            feature_dims[2],
+            stride=stride,
+            norm_layer=norm_layer,
+        )  # 1/4 or 1/8
+
+        self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)
+
+        if self.num_branch > 1:
+            strides_per_branch = {2: (1, 2), 3: (1, 2, 4), 4: (1, 2, 4, 8)}
+            if self.num_branch not in strides_per_branch:
+                raise ValueError(f"unsupported num_output_scales={num_output_scales!r}; expected 1, 2, 3 or 4")
+
+            self.trident_conv = MultiScaleTridentConv(
+                output_dim,
+                output_dim,
+                kernel_size=3,
+                strides=strides_per_branch[self.num_branch],
+                paddings=1,
+                num_branch=self.num_branch,
+            )
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
+                if m.weight is not None:
+                    nn.init.constant_(m.weight, 1)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
+        """Stack two residual blocks; only the first one may change stride or width."""
+        layer1 = ResidualBlock(self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation)
+        layer2 = ResidualBlock(dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation)
+
+        self.in_planes = dim
+        return nn.Sequential(layer1, layer2)
+
+    def forward(self, x):
+        """Return a list of feature maps for the image batch ``x`` ([B, 3, H, W])."""
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu1(x)
+
+        x = self.layer1(x)  # 1/2
+        x = self.layer2(x)  # 1/4
+        x = self.layer3(x)  # 1/8 or 1/4
+        x = self.conv2(x)
+
+        if self.num_branch > 1:
+            return self.trident_conv([x] * self.num_branch)  # high to low res
+        return [x]
diff --git a/tools/scoring/optical_flow/unimatch/geometry.py b/tools/scoring/optical_flow/unimatch/geometry.py
new file mode 100644
index 0000000..df4d8e3
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/geometry.py
@@ -0,0 +1,200 @@
+import torch
+import torch.nn.functional as F
+
+
+def coords_grid(b, h, w, homogeneous=False, device=None):
+    """Return the pixel-coordinate grid of an ``h`` x ``w`` image, repeated ``b`` times.
+
+    Channel 0 holds x (column index), channel 1 holds y (row index); with
+    ``homogeneous=True`` a third channel of ones is appended.  The result is a
+    float32 tensor of shape [B, 2, H, W] or [B, 3, H, W], created on ``device``
+    (the default device when ``device`` is None).
+    """
+    # indexing="ij" states the historical default explicitly; without it recent
+    # PyTorch versions emit a UserWarning on every call.
+    y, x = torch.meshgrid(
+        torch.arange(h, device=device), torch.arange(w, device=device), indexing="ij"
+    )  # [H, W]
+    stacks = [x, y, torch.ones_like(x)] if homogeneous else [x, y]
+
+    grid = torch.stack(stacks, dim=0).float()  # [2, H, W] or [3, H, W]
+    return grid[None].repeat(b, 1, 1, 1)
+
+
+def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
+    """Return a [len_h, len_w, 2] float grid of (x, y) offsets; ``device`` is required."""
+    assert device is not None
+    xs = torch.linspace(w_min, w_max, len_w, device=device)
+    ys = torch.linspace(h_min, h_max, len_h, device=device)
+    # indexing="ij" spells out the legacy default and silences the meshgrid
+    # UserWarning; x and y come out as [len_w, len_h] and are transposed below.
+    x, y = torch.meshgrid(xs, ys, indexing="ij")
+    return torch.stack((x, y), -1).transpose(0, 1).float()  # [H, W, 2]
+
+
+def normalize_coords(coords, h, w):
+    """Linearly map pixel coordinates (last axis = (x, y)) so an ``h`` x ``w`` image spans [-1, 1]."""
+    centre = torch.tensor([(w - 1) / 2.0, (h - 1) / 2.0], dtype=torch.float32).to(coords.device)
+    return (coords - centre) / centre  # [-1, 1]
+
+
+def bilinear_sample(img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False):
+    """Sample ``img`` ([B, C, H, W]) at pixel-space coordinates.
+
+    ``sample_coords`` is [B, 2, H, W] (x, y); a tensor whose dim 1 is not 2 is
+    assumed to be channel-last [B, H, W, 2] and is permuted first.  With
+    ``return_mask`` the function also returns a [B, H, W] bool mask that is
+    True where the normalised sample position lies inside [-1, 1].
+    """
+    if sample_coords.size(1) != 2:  # [B, H, W, 2]
+        sample_coords = sample_coords.permute(0, 3, 1, 2)
+    _, _, h, w = sample_coords.shape
+
+    # Rescale pixel indices to the [-1, 1] range used by grid_sample (align_corners=True).
+    x_norm = 2 * sample_coords[:, 0] / (w - 1) - 1
+    y_norm = 2 * sample_coords[:, 1] / (h - 1) - 1
+    sampling_grid = torch.stack([x_norm, y_norm], dim=-1)  # [B, H, W, 2]
+
+    sampled = F.grid_sample(img, sampling_grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+    if not return_mask:
+        return sampled
+    inside = (x_norm >= -1) & (y_norm >= -1) & (x_norm <= 1) & (y_norm <= 1)  # [B, H, W]
+    return sampled, inside
+
+
+def flow_warp(feature, flow, mask=False, padding_mode="zeros"):
+    """Sample ``feature`` [B, C, H, W] at (pixel grid + ``flow``); ``mask`` also returns validity."""
+    batch, _, height, width = feature.size()
+    assert flow.size(1) == 2
+    # Where every output pixel reads from: its own position displaced by the flow.
+    sample_at = coords_grid(batch, height, width).to(flow.device) + flow  # [B, 2, H, W]
+    return bilinear_sample(feature, sample_at, padding_mode=padding_mode, return_mask=mask)
+
+
+def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
+    """Estimate occlusion masks from a forward/backward flow pair.
+
+    fwd_flow, bwd_flow: [B, 2, H, W].  A pixel is flagged (1.0) when its flow and
+    the opposite flow sampled at the flow target fail to cancel within
+    ``alpha * (|fwd| + |bwd|) + beta``.  The default alpha and beta follow
+    UnFlow (https://arxiv.org/abs/1711.07837).
+
+    Returns (fwd_occ, bwd_occ), float tensors of shape [B, H, W].
+    """
+    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
+    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2
+    magnitude = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
+    tolerance = alpha * magnitude + beta
+
+    # Round-trip error: a consistent pair satisfies fwd + warp(bwd, fwd) ~ 0.
+    fwd_error = torch.norm(fwd_flow + flow_warp(bwd_flow, fwd_flow), dim=1)  # [B, H, W]
+    bwd_error = torch.norm(bwd_flow + flow_warp(fwd_flow, bwd_flow), dim=1)
+    return (fwd_error > tolerance).float(), (bwd_error > tolerance).float()
+
+
+def back_project(depth, intrinsics):
+    """Lift every pixel to a 3-D point using its depth and the inverse intrinsics.
+
+    depth: [B, H, W]; intrinsics: [B, 3, 3].  Returns points of shape [B, 3, H, W].
+    """
+    batch, height, width = depth.shape
+    pixels = coords_grid(batch, height, width, homogeneous=True, device=depth.device)  # [B, 3, H, W]
+
+    # K^-1 applied to the homogeneous pixel (x, y, 1) ...
+    rays = torch.inverse(intrinsics).bmm(pixels.view(batch, 3, -1)).view(batch, 3, height, width)
+    # ... then scaled by the per-pixel depth.
+    return rays * depth.unsqueeze(1)  # [B, 3, H, W]
+
+
+def camera_transform(points_ref, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None):
+    """Move 3-D points from the reference camera frame into the target camera frame.
+
+    points_ref: [B, 3, H, W]
+    extrinsics_ref, extrinsics_tgt: [B, 4, 4]; only used when ``extrinsics_rel`` is None
+    extrinsics_rel: [B, 4, 4] relative pose; takes precedence when given
+
+    Returns the transformed points, [B, 3, H, W].
+    """
+    batch, _, height, width = points_ref.shape
+
+    if extrinsics_rel is None:
+        # relative pose = target extrinsics composed with the inverse reference extrinsics
+        extrinsics_rel = torch.bmm(extrinsics_tgt, torch.inverse(extrinsics_ref))  # [B, 4, 4]
+
+    rotation, translation = extrinsics_rel[:, :3, :3], extrinsics_rel[:, :3, -1:]
+    moved = torch.bmm(rotation, points_ref.view(batch, 3, -1)) + translation  # [B, 3, H*W]
+    return moved.view(batch, 3, height, width)
+
+
+def reproject(points_tgt, intrinsics, return_mask=False):
+    """Project 3-D points in the target camera frame onto its image plane.
+
+    points_tgt: [B, 3, H, W]
+    intrinsics: [B, 3, 3]
+
+    Returns pixel coordinates [B, 2, H, W] in image scale; with ``return_mask``
+    also a [B, H, W] bool mask of the coordinates that fall inside the
+    ``h`` x ``w`` image.
+    """
+    batch, _, height, width = points_tgt.shape
+
+    projected = torch.bmm(intrinsics, points_tgt.view(batch, 3, -1)).view(batch, 3, height, width)
+
+    # Perspective division; z is clamped from below so the division never sees
+    # zero or negative values.
+    depth = projected[:, 2].clamp(min=1e-3)
+    u = projected[:, 0] / depth
+    v = projected[:, 1] / depth
+    pixel_coords = torch.stack([u, v], dim=1).view(batch, 2, height, width)  # [B, 2, H, W]
+
+    if not return_mask:
+        return pixel_coords
+
+    # valid mask in pixel space
+    in_bounds = (u >= 0) & (u <= (width - 1)) & (v >= 0) & (v <= (height - 1))  # [B, H, W]
+    return pixel_coords, in_bounds
+
+
+def reproject_coords(
+    depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
+):
+    """Compute where each reference pixel lands in the target view.
+
+    The reference depth map is back-projected to 3-D, moved into the target
+    camera frame and projected again with the same intrinsics.
+
+    Returns the sample coordinates [B, 2, H, W] in image scale, or the tuple
+    ``(coords, mask)`` when ``return_mask`` is true.
+    """
+    points_ref = back_project(depth_ref, intrinsics)  # [B, 3, H, W]
+    points_tgt = camera_transform(points_ref, extrinsics_ref, extrinsics_tgt, extrinsics_rel=extrinsics_rel)
+    # reproject already returns either the coordinates or the (coords, mask) pair
+    return reproject(points_tgt, intrinsics, return_mask=return_mask)
+
+
+def compute_flow_with_depth_pose(
+    depth_ref, intrinsics, extrinsics_ref=None, extrinsics_tgt=None, extrinsics_rel=None, return_mask=False
+):
+    """Flow obtained by reprojecting every pixel with a known depth and camera pose.
+
+    depth_ref: [B, H, W]
+    intrinsics: [B, 3, 3]
+    extrinsics_*: [B, 4, 4] poses, forwarded unchanged to ``reproject_coords``
+
+    Returns the rigid flow [B, 2, H, W] (reprojected position minus original
+    pixel position), or ``(rigid_flow, mask)`` when ``return_mask`` is true.
+    """
+    batch, height, width = depth_ref.shape
+    pixel_grid = coords_grid(batch, height, width, device=depth_ref.device)  # [B, 2, H, W]
+    reprojected = reproject_coords(
+        depth_ref,
+        intrinsics,
+        extrinsics_ref,
+        extrinsics_tgt,
+        extrinsics_rel=extrinsics_rel,
+        return_mask=return_mask,
+    )
+    if return_mask:
+        target_coords, mask = reprojected
+        return target_coords - pixel_grid, mask
+    return reprojected - pixel_grid
diff --git a/tools/scoring/optical_flow/unimatch/matching.py b/tools/scoring/optical_flow/unimatch/matching.py
new file mode 100644
index 0000000..fe5e103
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/matching.py
@@ -0,0 +1,307 @@
+import torch
+import torch.nn.functional as F
+
+from .geometry import coords_grid, generate_window_grid, normalize_coords
+
+
+def global_correlation_softmax(
+    feature0,
+    feature1,
+    pred_bidir_flow=False,
+):
+    """Dense matching by a softmax over the correlation with every target pixel.
+
+    feature0, feature1: [B, C, H, W].  The expected matching position of each
+    pixel is the softmax-weighted average of all pixel coordinates; the flow is
+    that position minus the pixel's own coordinate.
+
+    Returns ``(flow, prob)``: flow [B, 2, H, W] and prob [B, H*W, H*W].  With
+    ``pred_bidir_flow`` the batch axis is doubled: forward results first, then
+    the backward ones obtained from the transposed correlation.
+    """
+    b, c, h, w = feature0.shape
+    tokens0 = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]
+    tokens1 = feature1.view(b, c, -1)  # [B, C, H*W]
+    # Correlation of every source pixel with every target pixel, scaled by sqrt(C).
+    correlation = torch.matmul(tokens0, tokens1) / (c**0.5)  # [B, H*W, H*W]
+
+    init_grid = coords_grid(b, h, w).to(correlation.device)  # [B, 2, H, W]
+    grid = init_grid.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]
+
+    if pred_bidir_flow:
+        # Transposing the correlation swaps the roles of the two images.
+        correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0)  # [2*B, H*W, H*W]
+        init_grid, grid, b = init_grid.repeat(2, 1, 1, 1), grid.repeat(2, 1, 1), b * 2
+
+    prob = F.softmax(correlation, dim=-1)  # [B, H*W, H*W]
+    correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2)  # [B, 2, H, W]
+    return correspondence - init_grid, prob
+
+
+def local_correlation_softmax(
+    feature0,
+    feature1,
+    local_radius,
+    padding_mode="zeros",
+):
+    """Matching by a softmax restricted to a (2R+1) x (2R+1) window around each pixel.
+
+    feature0, feature1: [B, C, H, W]; ``local_radius`` is R.  Window positions
+    that fall outside the image are excluded from the softmax.
+
+    Returns ``(flow, match_prob)``: flow [B, 2, H, W] and the per-pixel window
+    probabilities [B, H*W, (2R+1)^2].
+    """
+    b, c, h, w = feature0.size()
+    device = feature0.device
+    side = 2 * local_radius + 1
+
+    coords_init = coords_grid(b, h, w).to(device)  # [B, 2, H, W]
+    coords = coords_init.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]
+
+    # Offsets of the window taps, shared by all pixels.
+    offsets = generate_window_grid(
+        -local_radius, local_radius, -local_radius, local_radius, side, side, device=device
+    )  # [2R+1, 2R+1, 2]
+    offsets = offsets.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
+    sample_coords = coords.unsqueeze(-2) + offsets  # [B, H*W, (2R+1)^2, 2]
+
+    # exclude coords that are out of image space
+    xs, ys = sample_coords[:, :, :, 0], sample_coords[:, :, :, 1]
+    valid = (xs >= 0) & (xs < w) & (ys >= 0) & (ys < h)  # [B, H*W, (2R+1)^2]
+
+    # Gather the target features at every window position.
+    window_feature = F.grid_sample(
+        feature1, normalize_coords(sample_coords, h, w), padding_mode=padding_mode, align_corners=True
+    ).permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)^2]
+    feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c)  # [B, H*W, 1, C]
+
+    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5)  # [B, H*W, (2R+1)^2]
+    # mask invalid locations
+    corr[~valid] = -1e9
+
+    match_prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)^2]
+
+    # Expected matching position = probability-weighted window coordinates.
+    correspondence = torch.matmul(match_prob.unsqueeze(-2), sample_coords).squeeze(-2)
+    correspondence = correspondence.view(b, h, w, 2).permute(0, 3, 1, 2)  # [B, 2, H, W]
+
+    return correspondence - coords_init, match_prob
+
+
+def local_correlation_with_flow(
+    feature0,
+    feature1,
+    flow,
+    local_radius,
+    padding_mode="zeros",
+    dilation=1,
+):
+    """Correlate each pixel of ``feature0`` with a window of ``feature1`` centred at pixel + flow.
+
+    feature0, feature1: [B, C, H, W].  ``flow`` is either a [B, 2, H, W] tensor or
+    the float 0.0 (no displacement).  The window spans (2R+1) x (2R+1) taps
+    spaced ``dilation`` pixels apart, with R = ``local_radius``.
+
+    Returns the correlation volume [B, (2R+1)^2, H, W].
+    """
+    b, c, h, w = feature0.size()
+    device = feature0.device
+    side = 2 * local_radius + 1
+    coords = coords_grid(b, h, w).to(device).view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]
+
+    offsets = generate_window_grid(
+        -local_radius, local_radius, -local_radius, local_radius, side, side, device=device
+    )  # [2R+1, 2R+1, 2]
+    offsets = offsets.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
+    sample_coords = coords.unsqueeze(-2) + offsets * dilation  # [B, H*W, (2R+1)^2, 2]
+
+    # flow can be zero when using features after transformer
+    if isinstance(flow, float):
+        assert flow == 0.0
+    else:
+        sample_coords = sample_coords + flow.view(b, 2, -1).permute(0, 2, 1).unsqueeze(-2)
+
+    window_feature = F.grid_sample(
+        feature1, normalize_coords(sample_coords, h, w), padding_mode=padding_mode, align_corners=True
+    ).permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)^2]
+    feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c)  # [B, H*W, 1, C]
+    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5)  # [B, H*W, (2R+1)^2]
+    return corr.view(b, h, w, -1).permute(0, 3, 1, 2).contiguous()  # [B, (2R+1)^2, H, W]
+
+
+def global_correlation_softmax_stereo(
+    feature0,
+    feature1,
+):
+    """Stereo matching by a softmax over the correlation along each image row.
+
+    feature0, feature1: [B, C, H, W].  A pixel at column x may only match
+    columns <= x of ``feature1``, so the resulting disparity is non-negative.
+
+    Returns ``(disparity, prob)``: disparity [B, 1, H, W] at feature resolution
+    and the matching distribution [B, H, W, W].
+    """
+    b, c, h, w = feature0.shape
+    columns = torch.linspace(0, w - 1, w, device=feature0.device)  # [W]
+
+    rows0 = feature0.permute(0, 2, 3, 1)  # [B, H, W, C]
+    rows1 = feature1.permute(0, 2, 1, 3)  # [B, H, C, W]
+    correlation = torch.matmul(rows0, rows1) / (c**0.5)  # [B, H, W, W]
+
+    # mask subsequent positions to make disparity positive
+    upper = torch.triu(torch.ones((w, w)), diagonal=1).type_as(rows0)  # [W, W]
+    allowed = (upper == 0).unsqueeze(0).unsqueeze(0).repeat(b, h, 1, 1)  # [B, H, W, W]
+    correlation[~allowed] = -1e9
+
+    prob = F.softmax(correlation, dim=-1)  # [B, H, W, W]
+    expected_column = (columns.view(1, 1, 1, w) * prob).sum(-1)  # [B, H, W]
+    # NOTE: unlike flow, disparity is typically positive
+    return (columns.view(1, 1, w).repeat(b, h, 1) - expected_column).unsqueeze(1), prob
+
+
+def local_correlation_softmax_stereo(
+    feature0,
+    feature1,
+    local_radius,
+):
+    """Stereo matching by a softmax over a horizontal window of 2R+1 positions.
+
+    feature0, feature1: [B, C, H, W]; ``local_radius`` is R.  Window positions
+    outside the image are excluded from the softmax.
+
+    Returns ``(flow_x, match_prob)``: the negated horizontal displacement
+    [B, 1, H, W] at feature resolution and the window probabilities
+    [B, H*W, 2R+1].
+    """
+    b, c, h, w = feature0.size()
+    device = feature0.device
+
+    coords_init = coords_grid(b, h, w).to(device)  # [B, 2, H, W]
+    coords = coords_init.view(b, 2, -1).permute(0, 2, 1).contiguous()  # [B, H*W, 2]
+
+    # A single-row window: no vertical offsets, 2R+1 horizontal ones.
+    local_h, local_w = 1, 2 * local_radius + 1
+
+    offsets = generate_window_grid(
+        0, 0, -local_radius, local_radius, local_h, local_w, device=device
+    )  # [1, 2R+1, 2]
+    offsets = offsets.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1), 2]
+    sample_coords = coords.unsqueeze(-2) + offsets  # [B, H*W, (2R+1), 2]
+
+    # exclude coords that are out of image space
+    xs, ys = sample_coords[:, :, :, 0], sample_coords[:, :, :, 1]
+    valid = (xs >= 0) & (xs < w) & (ys >= 0) & (ys < h)  # [B, H*W, 2R+1]
+
+    # Gather the target features at every window position.
+    window_feature = F.grid_sample(
+        feature1, normalize_coords(sample_coords, h, w), padding_mode="zeros", align_corners=True
+    ).permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)]
+    feature0_view = feature0.permute(0, 2, 3, 1).contiguous().view(b, h * w, 1, c)  # [B, H*W, 1, C]
+
+    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c**0.5)  # [B, H*W, (2R+1)]
+
+    # mask invalid locations
+    corr[~valid] = -1e9
+
+    match_prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)]
+
+    # Expected matching position = probability-weighted window coordinates.
+    correspondence = torch.matmul(match_prob.unsqueeze(-2), sample_coords).squeeze(-2)
+    correspondence = correspondence.view(b, h, w, 2).permute(0, 3, 1, 2).contiguous()  # [B, 2, H, W]
+
+    flow = correspondence - coords_init  # flow at feature resolution
+
+    # Only the horizontal component is kept, with its sign flipped.
+    return -flow[:, :1], match_prob  # [B, 1, H, W]
+
+
+def correlation_softmax_depth(
+    feature0,
+    feature1,
+    intrinsics,
+    pose,
+    depth_candidates,
+    depth_from_argmax=False,
+    pred_bidir_depth=False,
+):
+    """Pick a depth per pixel by a softmax over correlations at candidate depths.
+
+    feature0, feature1: [B, C, H, W]; intrinsics: [B, 3, 3]; pose: [B, 4, 4];
+    depth_candidates: [B, D, H, W] (actually inverse depths).  With
+    ``pred_bidir_depth`` the two views are additionally matched in the opposite
+    direction (inverse pose) and the batch axis of the outputs is doubled.
+
+    Returns ``(depth, match_prob)``: depth [B, 1, H, W] in the units of
+    ``depth_candidates`` and match_prob [B, D, H, W].
+    """
+    _, c, _, _ = feature0.size()
+    assert depth_candidates.dim() == 4  # [B, D, H, W]
+
+    if pred_bidir_depth:
+        feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
+        intrinsics = intrinsics.repeat(2, 1, 1)
+        pose = torch.cat((pose, torch.inverse(pose)), dim=0)
+        depth_candidates = depth_candidates.repeat(2, 1, 1, 1)
+
+    # depth candidates are actually inverse depth, hence the reciprocal
+    warped_feature1 = warp_with_pose_depth_candidates(feature1, intrinsics, pose, 1.0 / depth_candidates)
+    # [B, C, D, H, W] -> per-candidate dot product with feature0, scaled by sqrt(C)
+    correlation = (feature0.unsqueeze(2) * warped_feature1).sum(1) / c**0.5  # [B, D, H, W]
+    match_prob = F.softmax(correlation, dim=1)  # [B, D, H, W]
+
+    if not depth_from_argmax:
+        return (match_prob * depth_candidates).sum(dim=1, keepdim=True), match_prob  # [B, 1, H, W]
+    # for cross-task transfer (flow -> depth), extract depth with argmax at test time
+    best = torch.argmax(match_prob, dim=1, keepdim=True)
+    return torch.gather(depth_candidates, dim=1, index=best), match_prob
+
+
+def warp_with_pose_depth_candidates(
+    feature1,
+    intrinsics,
+    pose,
+    depth,
+    clamp_min_depth=1e-3,
+):
+    """
+    Sample ``feature1`` at the reprojection of every pixel, once per depth candidate.
+
+    feature1: [B, C, H, W]
+    intrinsics: [B, 3, 3]
+    pose: [B, 4, 4]
+    depth: [B, D, H, W]
+    clamp_min_depth: lower bound applied to the projected z before dividing
+
+    Returns the warped features, [B, C, D, H, W].  The sampling grid is built
+    under ``torch.no_grad()``, so gradients reach ``feature1`` only.
+    """
+    assert intrinsics.size(1) == intrinsics.size(2) == 3
+    assert pose.size(1) == pose.size(2) == 4
+    assert depth.dim() == 4
+
+    b, d, h, w = depth.size()
+    c = feature1.size(1)
+    rotation, translation = pose[:, :3, :3], pose[:, :3, -1:]
+
+    with torch.no_grad():
+        # pixel coordinates
+        pixels = coords_grid(b, h, w, homogeneous=True, device=depth.device)  # [B, 3, H, W]
+        # back project to 3D and transform viewpoint
+        rays = torch.bmm(rotation, torch.inverse(intrinsics).bmm(pixels.view(b, 3, -1)))  # [B, 3, H*W]
+        points = rays.unsqueeze(2).repeat(1, 1, d, 1) * depth.view(b, 1, d, h * w)  # [B, 3, D, H*W]
+        points = points + translation.unsqueeze(-1)  # [B, 3, D, H*W]
+        # reproject to 2D image plane
+        points = torch.bmm(intrinsics, points.view(b, 3, -1)).view(b, 3, d, h * w)  # [B, 3, D, H*W]
+        pixel_coords = points[:, :2] / points[:, -1:].clamp(min=clamp_min_depth)  # [B, 2, D, H*W]
+
+        # normalize to [-1, 1]
+        x_norm = 2 * pixel_coords[:, 0] / (w - 1) - 1
+        y_norm = 2 * pixel_coords[:, 1] / (h - 1) - 1
+        sampling_grid = torch.stack([x_norm, y_norm], dim=-1).view(b, d * h, w, 2)  # [B, D*H, W, 2]
+
+    # sample features
+    warped = F.grid_sample(
+        feature1, sampling_grid, mode="bilinear", padding_mode="zeros", align_corners=True
+    )
+    return warped.view(b, c, d, h, w)  # [B, C, D, H, W]
diff --git a/tools/scoring/optical_flow/unimatch/position.py b/tools/scoring/optical_flow/unimatch/position.py
new file mode 100644
index 0000000..619f356
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/position.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py
+
+import math
+
+import torch
+import torch.nn as nn
+
+
+class PositionEmbeddingSine(nn.Module):
+    """
+    Fixed 2D sine/cosine position embedding (the image generalization of the
+    encoding from "Attention is all you need"); the module has no parameters.
+    """
+
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
+        super().__init__()
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        self.num_pos_feats = num_pos_feats  # channels per axis; the output has twice as many
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = 2 * math.pi if scale is None else scale
+
+    @staticmethod
+    def _encode(embed, dim_t):
+        # [B, H, W] -> [B, H, W, F]: sin on even channels, cos on odd ones, interleaved
+        angles = embed[:, :, :, None] / dim_t
+        return torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4).flatten(3)
+
+    def forward(self, x):
+        # x: [B, C, H, W]; only its shape and device are used -> [B, 2 * num_pos_feats, H, W]
+        batch, _, height, width = x.size()
+        # no padding mask is used here, so every position counts and indices start at 1
+        ones = torch.ones((batch, height, width), device=x.device)
+        y_embed = ones.cumsum(1, dtype=torch.float32)
+        x_embed = ones.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            # divide by the last row / column so that coordinates span (0, scale)
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+        # each pair of channels shares one frequency, set by `temperature`
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        return torch.cat((self._encode(y_embed, dim_t), self._encode(x_embed, dim_t)), dim=3).permute(0, 3, 1, 2)
diff --git a/tools/scoring/optical_flow/unimatch/reg_refine.py b/tools/scoring/optical_flow/unimatch/reg_refine.py
new file mode 100644
index 0000000..965f4ca
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/reg_refine.py
@@ -0,0 +1,133 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class FlowHead(nn.Module):
+    """Two-layer 3x3 conv head: [B, input_dim, H, W] -> [B, out_dim, H, W]."""
+
+    def __init__(
+        self,
+        input_dim=128,
+        hidden_dim=256,
+        out_dim=2,
+    ):
+        super().__init__()
+        self.conv1 = nn.Conv2d(input_dim, hidden_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv2d(hidden_dim, out_dim, kernel_size=3, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        hidden = self.relu(self.conv1(x))  # spatial size is preserved by the padding
+        return self.conv2(hidden)
+
+
+class SepConvGRU(nn.Module):
+    """Convolutional GRU whose update is split into a 1xK pass followed by a Kx1 pass."""
+
+    def __init__(
+        self,
+        hidden_dim=128,
+        input_dim=192 + 128,
+        kernel_size=5,
+    ):
+        super(SepConvGRU, self).__init__()
+        in_dim = hidden_dim + input_dim  # every gate sees the concatenation [h, x]
+        pad = (kernel_size - 1) // 2  # keeps H and W unchanged for odd kernel sizes
+
+        # horizontal (1 x K) gates
+        self.convz1 = nn.Conv2d(in_dim, hidden_dim, (1, kernel_size), padding=(0, pad))
+        self.convr1 = nn.Conv2d(in_dim, hidden_dim, (1, kernel_size), padding=(0, pad))
+        self.convq1 = nn.Conv2d(in_dim, hidden_dim, (1, kernel_size), padding=(0, pad))
+        # vertical (K x 1) gates
+        self.convz2 = nn.Conv2d(in_dim, hidden_dim, (kernel_size, 1), padding=(pad, 0))
+        self.convr2 = nn.Conv2d(in_dim, hidden_dim, (kernel_size, 1), padding=(pad, 0))
+        self.convq2 = nn.Conv2d(in_dim, hidden_dim, (kernel_size, 1), padding=(pad, 0))
+
+    @staticmethod
+    def _gru_step(h, x, conv_z, conv_r, conv_q):
+        # GRU cell: update gate z, reset gate r, candidate state q
+        hx = torch.cat([h, x], dim=1)
+        z = torch.sigmoid(conv_z(hx))
+        r = torch.sigmoid(conv_r(hx))
+        q = torch.tanh(conv_q(torch.cat([r * h, x], dim=1)))
+        return (1 - z) * h + z * q
+
+    def forward(self, h, x):
+        h = self._gru_step(h, x, self.convz1, self.convr1, self.convq1)  # horizontal
+        return self._gru_step(h, x, self.convz2, self.convr2, self.convq2)  # vertical
+
+
+class BasicMotionEncoder(nn.Module):
+    """Fuse correlation features and the current flow into a 128-channel motion feature."""
+
+    def __init__(
+        self,
+        corr_channels=324,
+        flow_channels=2,
+    ):
+        super(BasicMotionEncoder, self).__init__()
+        # correlation branch
+        self.convc1 = nn.Conv2d(corr_channels, 256, 1, padding=0)
+        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
+        # flow branch
+        self.convf1 = nn.Conv2d(flow_channels, 128, 7, padding=3)
+        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
+        # fusion leaves room for the raw flow so the output has exactly 128 channels
+        self.conv = nn.Conv2d(64 + 192, 128 - flow_channels, 3, padding=1)
+
+    def forward(self, flow, corr):
+        corr_feat = F.relu(self.convc2(F.relu(self.convc1(corr))))
+        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))
+        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
+        return torch.cat([fused, flow], dim=1)  # [B, 128, H, W]
+
+
+class BasicUpdateBlock(nn.Module):
+    """One refinement step: motion encoder -> separable ConvGRU -> flow head (+ optional mask head).
+
+    With ``bilinear_up=True`` no mask head is built and ``forward`` returns ``None`` for the mask.
+    """
+
+    # channel count of BasicMotionEncoder's output (fixed, independent of ``hidden_dim``)
+    MOTION_DIM = 128
+
+    def __init__(
+        self,
+        corr_channels=324,
+        hidden_dim=128,
+        context_dim=128,
+        downsample_factor=8,
+        flow_dim=2,
+        bilinear_up=False,
+    ):
+        super(BasicUpdateBlock, self).__init__()
+
+        self.encoder = BasicMotionEncoder(corr_channels=corr_channels, flow_channels=flow_dim)
+
+        # The GRU input is cat([context, motion features]); size it from the encoder's real
+        # output width instead of ``hidden_dim`` so that hidden_dim != 128 does not hit a
+        # channel mismatch in forward(). Identical to the previous sizing when hidden_dim == 128.
+        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=context_dim + self.MOTION_DIM)
+
+        self.flow_head = FlowHead(hidden_dim, hidden_dim=256, out_dim=flow_dim)
+
+        if bilinear_up:
+            self.mask = None
+        else:
+            # downsample_factor**2 * 9 channels; presumably RAFT-style upsampling weights (confirm)
+            self.mask = nn.Sequential(
+                nn.Conv2d(hidden_dim, 256, 3, padding=1),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(256, downsample_factor**2 * 9, 1, padding=0),
+            )
+
+    def forward(self, net, inp, corr, flow):
+        # net: GRU hidden state, inp: context features, corr: correlation features, flow: current estimate
+        # returns (updated hidden state, upsampling mask or None, residual flow)
+        motion_features = self.encoder(flow, corr)
+        net = self.gru(net, torch.cat([inp, motion_features], dim=1))
+
+        delta_flow = self.flow_head(net)
+        mask = self.mask(net) if self.mask is not None else None
+        return net, mask, delta_flow
diff --git a/tools/scoring/optical_flow/unimatch/transformer.py b/tools/scoring/optical_flow/unimatch/transformer.py
new file mode 100644
index 0000000..7fdffd1
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/transformer.py
@@ -0,0 +1,339 @@
+import torch
+import torch.nn as nn
+
+from .attention import (
+    single_head_full_attention,
+    single_head_full_attention_1d,
+    single_head_split_window_attention,
+    single_head_split_window_attention_1d,
+)
+from .utils import generate_shift_window_attn_mask, generate_shift_window_attn_mask_1d
+
+
+class TransformerLayer(nn.Module):
+    """Single-head attention layer with an optional feed-forward network.
+
+    ``forward(source, target)`` attends from ``source`` (queries) to ``target``
+    (keys and values) and returns ``source`` plus the normalized message, so the
+    same class serves as a self-attention layer (``target`` equal to ``source``)
+    and as a cross-attention layer.
+
+    Supported ``attn_type`` values:
+      * ``"swin"``: 2D shifted-window attention when ``attn_num_splits > 1``,
+        full attention otherwise.
+      * ``"self_swin2d_cross_1d"``: 2D (window or full) attention for
+        self-attention, full 1D attention for cross-attention.
+      * ``"self_swin2d_cross_swin1d"``: as above, but cross-attention uses 1D
+        shifted windows when ``attn_num_splits > 1``.
+      * anything else: full attention.
+
+    Only ``nhead == 1`` is implemented for the windowed and self/cross variants.
+    """
+
+    # attention types that treat self- and cross-attention differently (stereo)
+    _SELF_CROSS_TYPES = ("self_swin2d_cross_1d", "self_swin2d_cross_swin1d")
+
+    def __init__(
+        self,
+        d_model=128,
+        nhead=1,
+        no_ffn=False,
+        ffn_dim_expansion=4,
+    ):
+        super(TransformerLayer, self).__init__()
+
+        self.dim = d_model
+        self.nhead = nhead
+        self.no_ffn = no_ffn
+
+        # attention projections (all bias-free)
+        self.q_proj = nn.Linear(d_model, d_model, bias=False)
+        self.k_proj = nn.Linear(d_model, d_model, bias=False)
+        self.v_proj = nn.Linear(d_model, d_model, bias=False)
+
+        # output projection applied to the attended message
+        self.merge = nn.Linear(d_model, d_model, bias=False)
+
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # no ffn after self-attn, with ffn after cross-attn
+        if not self.no_ffn:
+            # the FFN consumes cat([source, message]), hence 2 * d_model inputs
+            in_channels = d_model * 2
+            self.mlp = nn.Sequential(
+                nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False),
+                nn.GELU(),
+                nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False),
+            )
+
+            self.norm2 = nn.LayerNorm(d_model)
+
+    def forward(
+        self,
+        source,
+        target,
+        height=None,
+        width=None,
+        shifted_window_attn_mask=None,
+        shifted_window_attn_mask_1d=None,
+        attn_type="swin",
+        with_shift=False,
+        attn_num_splits=None,
+    ):
+        """Return ``source`` updated with information attended from ``target``.
+
+        Args:
+            source: [B, L, C] query features.
+            target: [B, L, C] key/value features.
+            height, width: forwarded as ``h`` / ``w`` to the windowed and 1D
+                attention functions.
+            shifted_window_attn_mask: mask forwarded to 2D window attention.
+            shifted_window_attn_mask_1d: mask forwarded to 1D window attention;
+                required for ``"self_swin2d_cross_swin1d"`` cross-attention
+                with ``attn_num_splits > 1``.
+            attn_type: attention variant, see the class docstring.
+            with_shift: forwarded to the window attention functions.
+            attn_num_splits: number of window splits; ``None`` is treated
+                like ``1`` (no windows).
+
+        Returns:
+            [B, L, C] tensor ``source + message``.
+
+        Raises:
+            NotImplementedError: ``nhead > 1`` with a windowed or self/cross variant.
+            AssertionError: 1D window cross-attention requested without
+                ``shifted_window_attn_mask_1d``.
+        """
+        # ``None`` (the declared default) used to raise a TypeError in the ``> 1`` tests
+        num_splits = 1 if attn_num_splits is None else attn_num_splits
+        use_windows = num_splits > 1
+        self_cross = attn_type in self._SELF_CROSS_TYPES
+
+        if self.nhead > 1 and (self_cross or (attn_type == "swin" and use_windows)):
+            # we observe that multihead attention slows down the speed and increases the memory consumption
+            # without bringing obvious performance gains and thus the implementation is removed
+            raise NotImplementedError
+
+        # single-head attention
+        query = self.q_proj(source)  # [B, L, C]
+        key = self.k_proj(target)  # [B, L, C]
+        value = self.v_proj(target)  # [B, L, C]
+
+        # arguments shared by the 2D and 1D shifted-window attention functions
+        window_kwargs = dict(
+            num_splits=num_splits,
+            with_shift=with_shift,
+            h=height,
+            w=width,
+        )
+
+        if attn_type == "swin" and use_windows:  # self, cross-attn: both swin 2d
+            message = single_head_split_window_attention(
+                query, key, value, attn_mask=shifted_window_attn_mask, **window_kwargs
+            )
+        elif self_cross:
+            # for stereo: 2d attn in self-attn, 1d attn in cross-attn. The test compares values
+            # over the whole tensors and is turned into a Python bool, so it is only evaluated
+            # for the attention types that actually branch on it (it used to run on every call).
+            is_self_attn = bool((source - target).abs().max() < 1e-6)
+
+            if is_self_attn and use_windows:
+                # self attn shift window
+                message = single_head_split_window_attention(
+                    query, key, value, attn_mask=shifted_window_attn_mask, **window_kwargs
+                )
+            elif is_self_attn:
+                # full 2d attn
+                message = single_head_full_attention(query, key, value)  # [B, L, C]
+            elif attn_type == "self_swin2d_cross_swin1d" and use_windows:
+                assert shifted_window_attn_mask_1d is not None
+                # cross attn 1d shift
+                message = single_head_split_window_attention_1d(
+                    query, key, value, attn_mask=shifted_window_attn_mask_1d, **window_kwargs
+                )
+            else:
+                # cross attn 1d
+                message = single_head_full_attention_1d(query, key, value, h=height, w=width)
+        else:
+            # "swin" without window splitting, or any other attn_type
+            message = single_head_full_attention(query, key, value)  # [B, L, C]
+
+        # output projection and normalization; the residual is added last
+        message = self.merge(message)  # [B, L, C]
+        message = self.norm1(message)
+
+        if not self.no_ffn:
+            # the FFN sees the untouched source next to the attended message
+            message = self.mlp(torch.cat([source, message], dim=-1))
+            message = self.norm2(message)
+
+        return source + message
+
+
+class TransformerBlock(nn.Module):
+    """Self-attention layer followed by a cross-attention layer with FFN.
+
+    ``source`` is first refined against itself and then against ``target``;
+    only the updated ``source`` is returned.
+    """
+
+    def __init__(
+        self,
+        d_model=128,
+        nhead=1,
+        ffn_dim_expansion=4,
+    ):
+        super(TransformerBlock, self).__init__()
+
+        shared = dict(d_model=d_model, nhead=nhead, ffn_dim_expansion=ffn_dim_expansion)
+
+        # the self-attention layer is built without FFN, the cross-attention layer with it
+        self.self_attn = TransformerLayer(no_ffn=True, **shared)
+        self.cross_attn_ffn = TransformerLayer(**shared)
+
+    def forward(
+        self,
+        source,
+        target,
+        height=None,
+        width=None,
+        shifted_window_attn_mask=None,
+        shifted_window_attn_mask_1d=None,
+        attn_type="swin",
+        with_shift=False,
+        attn_num_splits=None,
+    ):
+        """Update ``source`` using itself and then ``target``.
+
+        Args:
+            source, target: [B, L, C] features.
+            height, width, shifted_window_attn_mask, attn_type, with_shift,
+                attn_num_splits: forwarded unchanged to both layers.
+            shifted_window_attn_mask_1d: forwarded to the cross-attention
+                layer only.
+
+        Returns:
+            The updated ``source`` as returned by the cross-attention layer.
+        """
+        common = dict(
+            height=height,
+            width=width,
+            shifted_window_attn_mask=shifted_window_attn_mask,
+            attn_type=attn_type,
+            with_shift=with_shift,
+            attn_num_splits=attn_num_splits,
+        )
+
+        # self attention
+        attended = self.self_attn(source, source, **common)
+
+        # cross attention and ffn
+        return self.cross_attn_ffn(
+            attended,
+            target,
+            shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
+            **common,
+        )
+
+
+class FeatureTransformer(nn.Module):
+    """Stack of ``TransformerBlock`` layers that refines two feature maps jointly.
+
+    The two maps are concatenated along the batch axis, so each block updates
+    ``feature0`` against ``feature1`` and ``feature1`` against ``feature0`` in one pass.
+    """
+
+    def __init__(self, num_layers=6, d_model=128, nhead=1, ffn_dim_expansion=4):
+        super(FeatureTransformer, self).__init__()
+
+        self.d_model = d_model
+        self.nhead = nhead
+
+        self.layers = nn.ModuleList(
+            [
+                TransformerBlock(d_model=d_model, nhead=nhead, ffn_dim_expansion=ffn_dim_expansion)
+                for _ in range(num_layers)
+            ]
+        )
+
+        # Xavier-initialize every parameter with more than one dimension
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+
+    def forward(self, feature0, feature1, attn_type="swin", attn_num_splits=None, **kwargs):
+        """Refine ``feature0`` and ``feature1`` against each other.
+
+        Args:
+            feature0, feature1: [B, C, H, W] with ``C == d_model``.
+            attn_type: attention variant forwarded to every layer; shifted-window
+                masks are only built when the name contains ``"swin"``.
+            attn_num_splits: number of window splits. ``None`` (the default) is
+                treated like ``1``, i.e. no window splitting.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            Tuple ``(feature0, feature1)``, each [B, C, H, W].
+        """
+        b, c, h, w = feature0.shape
+        assert self.d_model == c
+
+        # The declared default ``None`` used to fail on ``None > 1``. Normalize it once and hand
+        # the normalized value to the layers so they never have to compare ``None`` either.
+        num_splits = 1 if attn_num_splits is None else attn_num_splits
+        use_windows = "swin" in attn_type and num_splits > 1
+
+        feature0 = feature0.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]
+        feature1 = feature1.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]
+
+        # compute the attn masks once, they are shared by all layers
+        shifted_window_attn_mask = None
+        shifted_window_attn_mask_1d = None
+        if use_windows:
+            # global and refine use different number of splits
+            window_size_h = h // num_splits
+            window_size_w = w // num_splits
+
+            # 2d attention
+            shifted_window_attn_mask = generate_shift_window_attn_mask(
+                input_resolution=(h, w),
+                window_size_h=window_size_h,
+                window_size_w=window_size_w,
+                shift_size_h=window_size_h // 2,
+                shift_size_w=window_size_w // 2,
+                device=feature0.device,
+            )  # [K*K, H/K*W/K, H/K*W/K]
+
+            # 1d attention ("swin1d" can only match when "swin" does)
+            if "swin1d" in attn_type:
+                shifted_window_attn_mask_1d = generate_shift_window_attn_mask_1d(
+                    input_w=w, window_size_w=window_size_w, shift_size_w=window_size_w // 2, device=feature0.device
+                )  # [K, W/K, W/K]
+
+        # concat feature0 and feature1 in batch dimension to compute in parallel
+        concat0 = torch.cat((feature0, feature1), dim=0)  # [2B, H*W, C]
+        concat1 = torch.cat((feature1, feature0), dim=0)  # [2B, H*W, C]
+
+        for i, layer in enumerate(self.layers):
+            concat0 = layer(
+                concat0,
+                concat1,
+                height=h,
+                width=w,
+                attn_type=attn_type,
+                with_shift=use_windows and i % 2 == 1,  # shift in every second block
+                attn_num_splits=num_splits,
+                shifted_window_attn_mask=shifted_window_attn_mask,
+                shifted_window_attn_mask_1d=shifted_window_attn_mask_1d,
+            )
+
+            # update feature1: swap the two batch halves of the new concat0
+            concat1 = torch.cat(concat0.chunk(chunks=2, dim=0)[::-1], dim=0)
+
+        feature0, feature1 = concat0.chunk(chunks=2, dim=0)  # [B, H*W, C]
+
+        # reshape back
+        feature0 = feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()  # [B, C, H, W]
+        feature1 = feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()  # [B, C, H, W]
+
+        return feature0, feature1
diff --git a/tools/scoring/optical_flow/unimatch/trident_conv.py b/tools/scoring/optical_flow/unimatch/trident_conv.py
new file mode 100644
index 0000000..d87579b
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/trident_conv.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# https://github.com/facebookresearch/detectron2/blob/main/projects/TridentNet/tridentnet/trident_conv.py
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
+
+
+class MultiScaleTridentConv(nn.Module):
+    """Convolution that applies one shared weight to a list of input branches.
+
+    Every branch has its own stride and padding (``strides`` / ``paddings``; an int is used
+    for all branches) while weight, bias, ``dilation`` and ``groups`` are shared. In training
+    mode, or when ``test_branch_idx == -1``, ``forward`` expects ``num_branch`` inputs; otherwise
+    it expects a single input and applies only the settings of branch ``test_branch_idx``.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        strides=1,
+        paddings=0,
+        dilations=1,
+        dilation=1,
+        groups=1,
+        num_branch=1,
+        test_branch_idx=-1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        super(MultiScaleTridentConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.num_branch = num_branch
+        self.stride = _pair(stride)
+        self.groups = groups
+        self.with_bias = bias
+        self.dilation = dilation
+        self.test_branch_idx = test_branch_idx
+        self.norm = norm
+        self.activation = activation
+
+        # per-branch settings; ``dilations`` is only stored, forward() convolves with ``dilation``
+        self.paddings = self._per_branch(paddings, num_branch)
+        self.dilations = self._per_branch(dilations, num_branch)
+        self.strides = self._per_branch(strides, num_branch)
+        assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1
+
+        self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, *self.kernel_size))
+        self.bias = nn.Parameter(torch.Tensor(out_channels)) if bias else None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    @staticmethod
+    def _per_branch(value, num_branch):
+        """Repeat an int ``num_branch`` times and turn every entry into a 2-tuple."""
+        values = [value] * num_branch if isinstance(value, int) else value
+        return [_pair(v) for v in values]
+
+    def forward(self, inputs):
+        """Convolve the list ``inputs`` branch by branch and return the list of outputs."""
+        all_branches = self.training or self.test_branch_idx == -1
+        assert len(inputs) == (self.num_branch if all_branches else 1)
+
+        if all_branches:
+            outputs = [
+                F.conv2d(x, self.weight, self.bias, stride, padding, self.dilation, self.groups)
+                for x, stride, padding in zip(inputs, self.strides, self.paddings)
+            ]
+        else:
+            # Single-branch inference with the stride/padding of the requested branch (the former
+            # ``x[idx] if idx == -1 else x[-1]`` always picked the last branch here).
+            stride, padding = self.strides[self.test_branch_idx], self.paddings[self.test_branch_idx]
+            outputs = [F.conv2d(inputs[0], self.weight, self.bias, stride, padding, self.dilation, self.groups)]
+
+        if self.norm is not None:
+            outputs = [self.norm(x) for x in outputs]
+        if self.activation is not None:
+            outputs = [self.activation(x) for x in outputs]
+        return outputs
diff --git a/tools/scoring/optical_flow/unimatch/unimatch.py b/tools/scoring/optical_flow/unimatch/unimatch.py
new file mode 100644
index 0000000..c625b99
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/unimatch.py
@@ -0,0 +1,393 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .attention import SelfAttnPropagation
+from .backbone import CNNEncoder
+from .geometry import compute_flow_with_depth_pose, flow_warp
+from .matching import (
+    correlation_softmax_depth,
+    global_correlation_softmax,
+    global_correlation_softmax_stereo,
+    local_correlation_softmax,
+    local_correlation_softmax_stereo,
+    local_correlation_with_flow,
+)
+from .reg_refine import BasicUpdateBlock
+from .transformer import FeatureTransformer
+from .utils import feature_add_position, normalize_img, upsample_flow_with_mask
+
+
+class UniMatch(nn.Module):
+    def __init__(
+        self,
+        num_scales=1,
+        feature_channels=128,
+        upsample_factor=8,
+        num_head=1,
+        ffn_dim_expansion=4,
+        num_transformer_layers=6,
+        reg_refine=False,  # optional local regression refinement
+        task="flow",
+    ):
+        """Create the backbone, the feature Transformer and the output heads selected by ``task``."""
+        super(UniMatch, self).__init__()
+
+        self.feature_channels = feature_channels
+        self.num_scales = num_scales
+        self.upsample_factor = upsample_factor
+        self.reg_refine = reg_refine
+        is_depth = task == "depth"
+
+        # CNN
+        self.backbone = CNNEncoder(output_dim=feature_channels, num_output_scales=num_scales)
+        # Transformer
+        self.transformer = FeatureTransformer(
+            num_layers=num_transformer_layers,
+            d_model=feature_channels,
+            nhead=num_head,
+            ffn_dim_expansion=ffn_dim_expansion,
+        )
+        # propagation with self-attn
+        self.feature_flow_attn = SelfAttnPropagation(in_channels=feature_channels)
+
+        if is_depth or not reg_refine:
+            # convex upsampling similar to RAFT
+            # concat feature0 and low res flow as input
+            self.upsampler = nn.Sequential(
+                nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
+            )
+            # thus far, all the learnable parameters are task-agnostic
+
+        if reg_refine:
+            # optional task-specific local regression refinement
+            self.refine_proj = nn.Conv2d(128, 256, 1)
+            self.refine = BasicUpdateBlock(
+                corr_channels=(2 * 4 + 1) ** 2,
+                downsample_factor=upsample_factor,
+                flow_dim=2 if task == "flow" else 1,
+                bilinear_up=is_depth,
+            )
+
+    def extract_feature(self, img0, img1):
+        """Run the backbone once on both images and split the result per image.
+
+        Returns two lists (one for img0, one for img1) with one feature map per
+        backbone output, in reversed backbone order (low to high resolution).
+        """
+        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
+        # list of [2B, C, H, W], resolution from high to low -> reversed: low to high
+        features = self.backbone(concat)[::-1]
+
+        # the first half of the batch axis belongs to img0, the second half to img1
+        halves = [torch.chunk(feature, 2, 0) for feature in features]
+        feature0 = [pair[0] for pair in halves]
+        feature1 = [pair[1] for pair in halves]
+
+        return feature0, feature1
+
+    def upsample_flow(self, flow, feature, bilinear=False, upsample_factor=8, is_depth=False):
+        """Upsample a low-resolution flow (or depth) map.
+
+        bilinear=True: interpolate by `upsample_factor`; flow values are scaled by the same
+        factor, depth values are left unscaled. `feature` is not used.
+        bilinear=False: mask predicted by `self.upsampler`, always at `self.upsample_factor`.
+        """
+        if not bilinear:
+            mask = self.upsampler(torch.cat((flow, feature), dim=1))
+            return upsample_flow_with_mask(flow, mask, upsample_factor=self.upsample_factor, is_depth=is_depth)
+        value_scale = 1 if is_depth else upsample_factor
+        return F.interpolate(flow, scale_factor=upsample_factor, mode="bilinear", align_corners=True) * value_scale
+
+    def forward(
+        self,
+        img0,
+        img1,
+        attn_type=None,
+        attn_splits_list=None,
+        corr_radius_list=None,
+        prop_radius_list=None,
+        num_reg_refine=1,
+        pred_bidir_flow=False,
+        task="flow",
+        intrinsics=None,
+        pose=None,  # relative pose transform
+        min_depth=1.0 / 0.5,  # inverse depth range
+        max_depth=1.0 / 10,
+        num_depth_candidates=64,
+        depth_from_argmax=False,
+        pred_bidir_depth=False,
+        **kwargs,
+    ):
+        if pred_bidir_flow:
+            assert task == "flow"
+
+        if task == "depth":
+            assert self.num_scales == 1  # multi-scale depth model is not supported yet
+
+        results_dict = {}
+        flow_preds = []
+
+        if task == "flow":
+            # stereo and depth tasks have normalized img in dataloader
+            img0, img1 = normalize_img(img0, img1)  # [B, 3, H, W]
+
+        # list of features, resolution low to high
+        feature0_list, feature1_list = self.extract_feature(img0, img1)  # list of features
+
+        flow = None
+
+        if task != "depth":
+            assert len(attn_splits_list) == len(corr_radius_list) == len(prop_radius_list) == self.num_scales
+        else:
+            assert len(attn_splits_list) == len(prop_radius_list) == self.num_scales == 1
+
+        for scale_idx in range(self.num_scales):
+            feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]
+
+            if pred_bidir_flow and scale_idx > 0:
+                # predicting bidirectional flow with refinement
+                feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat((feature1, feature0), dim=0)
+
+            feature0_ori, feature1_ori = feature0, feature1
+
+            upsample_factor = self.upsample_factor * (2 ** (self.num_scales - 1 - scale_idx))
+
+            if task == "depth":
+                # scale intrinsics
+                intrinsics_curr = intrinsics.clone()
+                intrinsics_curr[:, :2] = intrinsics_curr[:, :2] / upsample_factor
+
+            if scale_idx > 0:
+                assert task != "depth"  # not supported for multi-scale depth model
+                flow = F.interpolate(flow, scale_factor=2, mode="bilinear", align_corners=True) * 2
+
+            if flow is not None:
+                assert task != "depth"
+                flow = flow.detach()
+
+                if task == "stereo":
+                    # construct flow vector for disparity
+                    # flow here is actually disparity
+                    zeros = torch.zeros_like(flow)  # [B, 1, H, W]
+                    # NOTE: reverse disp, disparity is positive
+                    displace = torch.cat((-flow, zeros), dim=1)  # [B, 2, H, W]
+                    feature1 = flow_warp(feature1, displace)  # [B, C, H, W]
+                elif task == "flow":
+                    feature1 = flow_warp(feature1, flow)  # [B, C, H, W]
+                else:
+                    raise NotImplementedError
+
+            attn_splits = attn_splits_list[scale_idx]
+            if task != "depth":
+                corr_radius = corr_radius_list[scale_idx]
+            prop_radius = prop_radius_list[scale_idx]
+
+            # add position to features
+            feature0, feature1 = feature_add_position(feature0, feature1, attn_splits, self.feature_channels)
+
+            # Transformer
+            feature0, feature1 = self.transformer(
+                feature0,
+                feature1,
+                attn_type=attn_type,
+                attn_num_splits=attn_splits,
+            )
+
+            # correlation and softmax
+            if task == "depth":
+                # first generate depth candidates
+                b, _, h, w = feature0.size()
+                depth_candidates = torch.linspace(min_depth, max_depth, num_depth_candidates).type_as(feature0)
+                depth_candidates = depth_candidates.view(1, num_depth_candidates, 1, 1).repeat(
+                    b, 1, h, w
+                )  # [B, D, H, W]
+
+                flow_pred = correlation_softmax_depth(
+                    feature0,
+                    feature1,
+                    intrinsics_curr,
+                    pose,
+                    depth_candidates=depth_candidates,
+                    depth_from_argmax=depth_from_argmax,
+                    pred_bidir_depth=pred_bidir_depth,
+                )[0]
+
+            else:
+                if corr_radius == -1:  # global matching
+                    if task == "flow":
+                        flow_pred = global_correlation_softmax(feature0, feature1, pred_bidir_flow)[0]
+                    elif task == "stereo":
+                        flow_pred = global_correlation_softmax_stereo(feature0, feature1)[0]
+                    else:
+                        raise NotImplementedError
+                else:  # local matching
+                    if task == "flow":
+                        flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[0]
+                    elif task == "stereo":
+                        flow_pred = local_correlation_softmax_stereo(feature0, feature1, corr_radius)[0]
+                    else:
+                        raise NotImplementedError
+
+            # flow or residual flow
+            flow = flow + flow_pred if flow is not None else flow_pred
+
+            if task == "stereo":
+                flow = flow.clamp(min=0)  # positive disparity
+
+            # upsample to the original resolution for supervision at training time only
+            if self.training:
+                flow_bilinear = self.upsample_flow(
+                    flow, None, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
+                )
+                flow_preds.append(flow_bilinear)
+
+            # flow propagation with self-attn
+            if (pred_bidir_flow or pred_bidir_depth) and scale_idx == 0:
+                feature0 = torch.cat((feature0, feature1), dim=0)  # [2*B, C, H, W] for propagation
+
+            flow = self.feature_flow_attn(
+                feature0,
+                flow.detach(),
+                local_window_attn=prop_radius > 0,
+                local_window_radius=prop_radius,
+            )
+
+            # bilinear exclude the last one
+            if self.training and scale_idx < self.num_scales - 1:
+                flow_up = self.upsample_flow(
+                    flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
+                )
+                flow_preds.append(flow_up)
+
+            if scale_idx == self.num_scales - 1:
+                if not self.reg_refine:
+                    # upsample to the original image resolution
+
+                    if task == "stereo":
+                        flow_pad = torch.cat((-flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
+                        flow_up_pad = self.upsample_flow(flow_pad, feature0)
+                        flow_up = -flow_up_pad[:, :1]  # [B, 1, H, W]
+                    elif task == "depth":
+                        depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
+                        depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
+                            min=min_depth, max=max_depth
+                        )
+                        flow_up = depth_up_pad[:, :1]  # [B, 1, H, W]
+                    else:
+                        flow_up = self.upsample_flow(flow, feature0)
+
+                    flow_preds.append(flow_up)
+                else:
+                    # task-specific local regression refinement
+                    # supervise current flow
+                    if self.training:
+                        flow_up = self.upsample_flow(
+                            flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=task == "depth"
+                        )
+                        flow_preds.append(flow_up)
+
+                    assert num_reg_refine > 0
+                    for refine_iter_idx in range(num_reg_refine):
+                        flow = flow.detach()
+
+                        if task == "stereo":
+                            zeros = torch.zeros_like(flow)  # [B, 1, H, W]
+                            # NOTE: reverse disp, disparity is positive
+                            displace = torch.cat((-flow, zeros), dim=1)  # [B, 2, H, W]
+                            correlation = local_correlation_with_flow(
+                                feature0_ori,
+                                feature1_ori,
+                                flow=displace,
+                                local_radius=4,
+                            )  # [B, (2R+1)^2, H, W]
+                        elif task == "depth":
+                            if pred_bidir_depth and refine_iter_idx == 0:
+                                intrinsics_curr = intrinsics_curr.repeat(2, 1, 1)
+                                pose = torch.cat((pose, torch.inverse(pose)), dim=0)
+
+                                feature0_ori, feature1_ori = torch.cat((feature0_ori, feature1_ori), dim=0), torch.cat(
+                                    (feature1_ori, feature0_ori), dim=0
+                                )
+
+                            flow_from_depth = compute_flow_with_depth_pose(
+                                1.0 / flow.squeeze(1),
+                                intrinsics_curr,
+                                extrinsics_rel=pose,
+                            )
+
+                            correlation = local_correlation_with_flow(
+                                feature0_ori,
+                                feature1_ori,
+                                flow=flow_from_depth,
+                                local_radius=4,
+                            )  # [B, (2R+1)^2, H, W]
+
+                        else:
+                            correlation = local_correlation_with_flow(
+                                feature0_ori,
+                                feature1_ori,
+                                flow=flow,
+                                local_radius=4,
+                            )  # [B, (2R+1)^2, H, W]
+
+                        proj = self.refine_proj(feature0)
+
+                        net, inp = torch.chunk(proj, chunks=2, dim=1)
+
+                        net = torch.tanh(net)
+                        inp = torch.relu(inp)
+
+                        net, up_mask, residual_flow = self.refine(
+                            net,
+                            inp,
+                            correlation,
+                            flow.clone(),
+                        )
+
+                        if task == "depth":
+                            flow = (flow - residual_flow).clamp(min=min_depth, max=max_depth)
+                        else:
+                            flow = flow + residual_flow
+
+                        if task == "stereo":
+                            flow = flow.clamp(min=0)  # positive
+
+                        if self.training or refine_iter_idx == num_reg_refine - 1:
+                            if task == "depth":
+                                if refine_iter_idx < num_reg_refine - 1:
+                                    # bilinear upsampling
+                                    flow_up = self.upsample_flow(
+                                        flow, feature0, bilinear=True, upsample_factor=upsample_factor, is_depth=True
+                                    )
+                                else:
+                                    # last one convex upsampling
+                                    # NOTE: clamp depth due to the zero padding in the unfold in the convex upsampling
+                                    # pad depth to 2 channels as flow
+                                    depth_pad = torch.cat((flow, torch.zeros_like(flow)), dim=1)  # [B, 2, H, W]
+                                    depth_up_pad = self.upsample_flow(depth_pad, feature0, is_depth=True).clamp(
+                                        min=min_depth, max=max_depth
+                                    )
+                                    flow_up = depth_up_pad[:, :1]  # [B, 1, H, W]
+
+                            else:
+                                flow_up = upsample_flow_with_mask(
+                                    flow, up_mask, upsample_factor=self.upsample_factor, is_depth=task == "depth"
+                                )
+
+                            flow_preds.append(flow_up)
+
+        if task == "stereo":
+            for i in range(len(flow_preds)):
+                flow_preds[i] = flow_preds[i].squeeze(1)  # [B, H, W]
+
+        # convert inverse depth to depth
+        if task == "depth":
+            for i in range(len(flow_preds)):
+                flow_preds[i] = 1.0 / flow_preds[i].squeeze(1)  # [B, H, W]
+
+        results_dict.update({"flow_preds": flow_preds})
+
+        return results_dict
diff --git a/tools/scoring/optical_flow/unimatch/utils.py b/tools/scoring/optical_flow/unimatch/utils.py
new file mode 100644
index 0000000..60f40be
--- /dev/null
+++ b/tools/scoring/optical_flow/unimatch/utils.py
@@ -0,0 +1,219 @@
+import torch
+import torch.nn.functional as F
+
+from .position import PositionEmbeddingSine
+
+
+def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
+    """Return a [len_h, len_w, 2] float grid of (x, y) values: x spans [w_min, w_max], y spans [h_min, h_max]."""
+    assert device is not None
+    ys = torch.linspace(h_min, h_max, len_h, device=device)
+    xs = torch.linspace(w_min, w_max, len_w, device=device)
+    # Pass `indexing` explicitly: "ij" is the legacy default, and omitting it triggers a torch UserWarning.
+    y, x = torch.meshgrid(ys, xs, indexing="ij")  # each [H, W]
+
+    return torch.stack((x, y), -1).float()  # [H, W, 2]
+
+
+def normalize_coords(coords, h, w):
+    """Map pixel coords [B, H, W, 2] (x in [0, w - 1], y in [0, h - 1]) to [-1, 1] around the image center."""
+    half_extent = torch.tensor([(w - 1) / 2.0, (h - 1) / 2.0], dtype=torch.float32, device=coords.device)
+    return (coords - half_extent) / half_extent
+
+
+def normalize_img(img0, img1):
+    """Scale two image batches from [0, 255] to [0, 1], then standardize with the ImageNet mean/std.
+
+    The 3-channel statistics live on ``img1``'s device and broadcast as [1, 3, 1, 1]; returns (img0, img1).
+    """
+    mean = torch.tensor([0.485, 0.456, 0.406], device=img1.device).view(1, 3, 1, 1)
+    std = torch.tensor([0.229, 0.224, 0.225], device=img1.device).view(1, 3, 1, 1)
+
+    return (img0 / 255.0 - mean) / std, (img1 / 255.0 - mean) / std
+
+
+def split_feature(
+    feature,
+    num_splits=2,
+    channel_last=False,
+):
+    """Partition a feature map into ``num_splits x num_splits`` windows folded into the batch dim.
+
+    Args:
+        feature: [B, H, W, C] if ``channel_last`` else [B, C, H, W].
+        num_splits: number of windows K along each spatial axis; H and W must be divisible by it.
+        channel_last: layout flag for ``feature``.
+
+    Returns:
+        [B*K*K, H/K, W/K, C] if ``channel_last`` else [B*K*K, C, H/K, W/K], with windows ordered
+        row-major (window row, then window column) inside each original batch element.
+    """
+    k = num_splits
+
+    if channel_last:
+        b, h, w, c = feature.size()
+    else:
+        b, c, h, w = feature.size()
+    assert h % k == 0 and w % k == 0
+    win_h, win_w = h // k, w // k
+
+    if channel_last:
+        # view as (b, kh, h/k, kw, w/k, c), then bring the (kh, kw) window axes next to the batch axis
+        windows = feature.view(b, k, win_h, k, win_w, c).permute(0, 1, 3, 2, 4, 5)
+        return windows.reshape(b * k * k, win_h, win_w, c)  # [B*K*K, H/K, W/K, C]
+
+    # view as (b, c, kh, h/k, kw, w/k), then bring the (kh, kw) window axes next to the batch axis
+    windows = feature.view(b, c, k, win_h, k, win_w).permute(0, 2, 4, 1, 3, 5)
+    return windows.reshape(b * k * k, c, win_h, win_w)  # [B*K*K, C, H/K, W/K]
+
+
+def merge_splits(
+    splits,
+    num_splits=2,
+    channel_last=False,
+):
+    """Reassemble ``num_splits x num_splits`` windows (batch-folded, row-major) into full feature maps.
+
+    Args:
+        splits: [B*K*K, H/K, W/K, C] if ``channel_last`` else [B*K*K, C, H/K, W/K].
+        num_splits: number of windows K along each spatial axis.
+        channel_last: layout flag for ``splits``.
+
+    Returns:
+        [B, H, W, C] if ``channel_last`` else [B, C, H, W].
+    """
+    k = num_splits
+    if channel_last:
+        bk, h, w, c = splits.size()
+        grid = splits.view(bk // k // k, k, k, h, w, c).permute(0, 1, 3, 2, 4, 5)
+        return grid.contiguous().view(bk // k // k, k * h, k * w, c)  # [B, H, W, C]
+    bk, c, h, w = splits.size()
+    grid = splits.view(bk // k // k, k, k, c, h, w).permute(0, 3, 1, 4, 2, 5)
+    return grid.contiguous().view(bk // k // k, c, k * h, k * w)  # [B, C, H, W]
+
+
+def generate_shift_window_attn_mask(
+    input_resolution, window_size_h, window_size_w, shift_size_h, shift_size_w, device=torch.device("cuda")
+):
+    """Build the additive attention mask for shifted-window attention (SW-MSA).
+
+    ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+    The (h, w) map is labelled by 3 row bands x 3 column bands, cut into windows with ``split_feature``,
+    and each pair of positions in a window gets 0.0 if their labels match and -100.0 otherwise.
+    NOTE: the split count passed to ``split_feature`` is derived from the width only (w // window_size_w).
+    """
+    h, w = input_resolution
+    region_ids = torch.zeros((1, h, w, 1), device=device)  # 1 H W 1
+    row_bands = (slice(0, -window_size_h), slice(-window_size_h, -shift_size_h), slice(-shift_size_h, None))
+    col_bands = (slice(0, -window_size_w), slice(-window_size_w, -shift_size_w), slice(-shift_size_w, None))
+    for row_idx, rows in enumerate(row_bands):
+        for col_idx, cols in enumerate(col_bands):
+            region_ids[:, rows, cols, :] = row_idx * len(col_bands) + col_idx
+
+    windows = split_feature(region_ids, num_splits=input_resolution[-1] // window_size_w, channel_last=True)
+    flat_ids = windows.view(-1, window_size_h * window_size_w)
+    id_diff = flat_ids.unsqueeze(1) - flat_ids.unsqueeze(2)
+    return id_diff.masked_fill(id_diff != 0, float(-100.0)).masked_fill(id_diff == 0, float(0.0))
+
+
+def feature_add_position(feature0, feature1, attn_splits, feature_channels):
+    """Add one ``PositionEmbeddingSine`` encoding to both feature maps and return them.
+
+    With ``attn_splits > 1`` the encoding is computed from the window-split ``feature0`` and added to the
+    windows of both features before they are merged back; otherwise it is computed from ``feature0`` as
+    a whole. ``feature_channels // 2`` is passed as ``num_pos_feats``.
+    """
+    pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)
+
+    if attn_splits <= 1:
+        position = pos_enc(feature0)
+        return feature0 + position, feature1 + position
+
+    # add position within each split window
+    windows0 = split_feature(feature0, num_splits=attn_splits)
+    windows1 = split_feature(feature1, num_splits=attn_splits)
+    position = pos_enc(windows0)
+
+    merged0 = merge_splits(windows0 + position, num_splits=attn_splits)
+    merged1 = merge_splits(windows1 + position, num_splits=attn_splits)
+    return merged0, merged1
+
+
+def upsample_flow_with_mask(flow, up_mask, upsample_factor, is_depth=False):
+    """Convex upsampling following RAFT: each fine pixel is a softmax-weighted mix of a coarse 3x3 patch.
+
+    ``flow`` is [B, C, H, W]; ``up_mask`` must view as [B, 1, 9, K, K, H, W] with K = ``upsample_factor``.
+    Values are scaled by K before mixing unless ``is_depth``. Returns [B, C, K*H, K*W].
+    """
+    k = upsample_factor
+    b, flow_channel, h, w = flow.shape
+    weights = torch.softmax(up_mask.view(b, 1, 9, k, k, h, w), dim=2)  # [B, 1, 9, K, K, H, W]
+
+    scaled = flow if is_depth else k * flow
+    patches = F.unfold(scaled, [3, 3], padding=1).view(b, flow_channel, 9, 1, 1, h, w)  # zero-padded 3x3
+
+    up_flow = (weights * patches).sum(dim=2)  # [B, C, K, K, H, W]
+    up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)  # [B, C, H, K, W, K]
+
+    return up_flow.reshape(b, flow_channel, k * h, k * w)
+
+
+def split_feature_1d(
+    feature,
+    num_splits=2,
+):
+    """Split [B, W, C] into ``num_splits`` chunks along W, folded into the batch dim: [B*K, W/K, C].
+
+    ``W`` must be divisible by ``num_splits``; chunks of one batch element stay adjacent and in order.
+    """
+    b, w, c = feature.size()
+    assert w % num_splits == 0
+    chunk_w = w // num_splits
+
+    chunks = feature.view(b, num_splits, chunk_w, c)  # [B, K, W/K, C]
+    return chunks.view(b * num_splits, chunk_w, c)  # [B*K, W/K, C]
+
+
+def merge_splits_1d(
+    splits,
+    h,
+    num_splits=2,
+):
+    """Undo a width-wise split: view [B*H*K, W/K, C] chunks as [B, H, W, C] (K = ``num_splits``)."""
+    bhk, chunk_w, c = splits.size()
+    new_b = bhk // num_splits // h
+
+    # Both steps are pure views, so rows and chunks must already be laid out in (B, H, K) order.
+    rows = splits.view(new_b, h, num_splits, chunk_w, c)
+    return rows.view(new_b, h, num_splits * chunk_w, c)  # [B, H, W, C]
+
+
+def window_partition_1d(x, window_size_w):
+    """Cut x into non-overlapping windows of ``window_size_w`` along W.
+
+    Args:
+        x: (B, W, C)
+        window_size_w (int): window size; W must be divisible by it for the view to succeed
+
+    Returns: (num_windows*B, window_size_w, C)
+    """
+    batch, width, channels = x.shape
+    windows = x.view(batch, width // window_size_w, window_size_w, channels)
+    return windows.view(-1, window_size_w, channels)
+
+
+def generate_shift_window_attn_mask_1d(input_w, window_size_w, shift_size_w, device=torch.device("cuda")):
+    """Build the additive attention mask for 1D shifted-window attention (SW-MSA) along the width.
+
+    Positions are labelled with 3 band ids, cut into windows, and each pair of positions in a window gets
+    0.0 if their ids match and -100.0 otherwise. Returns [num_windows, window_size_w, window_size_w].
+    """
+    band_ids = torch.zeros((1, input_w, 1), device=device)  # 1 W 1
+    bands = (slice(0, -window_size_w), slice(-window_size_w, -shift_size_w), slice(-shift_size_w, None))
+    for band_idx, cols in enumerate(bands):
+        band_ids[:, cols, :] = band_idx
+
+    windows = window_partition_1d(band_ids, window_size_w)  # nW, window_size, 1
+    flat_ids = windows.view(-1, window_size_w)
+    id_diff = flat_ids.unsqueeze(1) - flat_ids.unsqueeze(2)  # nW, window_size, window_size
+    return id_diff.masked_fill(id_diff != 0, float(-100.0)).masked_fill(id_diff == 0, float(0.0))