feat: init my-sora — merge Open-Sora v2.0 (11B) + v1.3 data pipeline tools

This commit is contained in:
hailin 2026-03-05 22:53:15 -08:00
commit 410c20d4fa
226 changed files with 52659 additions and 0 deletions

22
.github/workflows/close_issue.yaml vendored Normal file
View File

@ -0,0 +1,22 @@
name: Close inactive issues
on:
schedule:
- cron: "30 1 * * *"
jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v9
with:
days-before-issue-stale: 7
days-before-issue-close: 7
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 7 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}

30
.github/workflows/github_page.yaml vendored Normal file
View File

@ -0,0 +1,30 @@
name: GitHub Pages
on:
workflow_dispatch:
jobs:
deploy:
runs-on: ubuntu-22.04
permissions:
contents: write
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
steps:
- uses: actions/checkout@v3
with:
ref: gallery
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 20
- run: npm install
- run: npm run build
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./build

198
.gitignore vendored Normal file
View File

@ -0,0 +1,198 @@
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.vscode/
# macos
*.DS_Store
# misc files
data/
dataset/
runs/
checkpoints/
outputs/
outputs
samples/
samples
logs/
pretrained_models/
pretrained_models
evaluation_results/
cache/
*.swp
debug/
# Secret files
hostfiles/
hostfile*
run.sh
gradio_cached_examples/
wandb/
# npm
node_modules/
package-lock.json
package.json
exps
ckpts
flash-attention
datasets

31
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,31 @@
repos:
- repo: https://github.com/PyCQA/autoflake
rev: v2.2.1
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place']
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
name: sort all imports (python)
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
hooks:
- id: black
name: black formatter
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: check-yaml
- id: check-merge-conflict
- id: check-case-conflict
- id: trailing-whitespace
- id: end-of-file-fixer
- id: mixed-line-ending
args: ['--fix=lf']

100
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,100 @@
# Contributing
The Open-Sora project welcomes any constructive contribution from the community and the team is more than willing to work on problems you have encountered to make it a better project.
## Development Environment Setup
To contribute to Open-Sora, we would like to first guide you to set up a proper development environment so that you can better implement your code. You can install this library from source with the `editable` flag (`-e`, for development mode) so that your change to the source code will be reflected in runtime without re-installation.
You can refer to the [Installation Section](./README.md#installation) and replace `pip install -v .` with `pip install -v -e .`.
### Code Style
We have some static checks when you commit your code change, please make sure you can pass all the tests and make sure the coding style meets our requirements. We use pre-commit hook to make sure the code is aligned with the writing standard. To set up the code style checking, you need to follow the steps below.
```shell
# these commands are executed under the Open-Sora directory
pip install pre-commit
pre-commit install
```
Code format checking will be automatically executed when you commit your changes.
## Contribution Guide
You need to follow these steps below to make contribution to the main repository via pull request. You can learn about the details of pull request [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests).
### 1. Fork the Official Repository
Firstly, you need to visit the [Open-Sora repository](https://github.com/hpcaitech/Open-Sora) and fork into your own account. The `fork` button is at the right top corner of the web page alongside with buttons such as `watch` and `star`.
Now, you can clone your own forked repository into your local environment.
```shell
git clone https://github.com/<YOUR-USERNAME>/Open-Sora.git
```
### 2. Configure Git
You need to set the official repository as your upstream so that you can synchronize with the latest update in the official repository. You can learn about upstream [here](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams).
Then add the original repository as upstream
```shell
cd Open-Sora
git remote add upstream https://github.com/hpcaitech/Open-Sora.git
```
You can use the following command to verify that the remote is set. You should see both `origin` and `upstream` in the output.
```shell
git remote -v
```
### 3. Synchronize with Official Repository
Before you make changes to the codebase, it is always good to fetch the latest updates in the official repository. In order to do so, you can use the commands below.
```shell
git fetch upstream
git checkout main
git merge upstream/main
git push origin main
```
### 4. Create a New Branch
You should not make changes to the `main` branch of your forked repository as this might make upstream synchronization difficult. You can create a new branch with the appropriate name. General branch name format should start with `hotfix/` or `feature/`. `hotfix` is for bug fix and `feature` is for addition of a new feature.
```shell
git checkout -b <NEW-BRANCH-NAME>
```
### 5. Implementation and Code Commit
Now you can implement your code change in the source code. Remember that you installed the system in development mode, thus you do not need to uninstall and install to make the code take effect. The code change will be reflected in every new Python execution.
You can commit and push the changes to your local repository. The changes should be kept logical, modular and atomic.
```shell
git add -A
git commit -m "<COMMIT-MESSAGE>"
git push -u origin <NEW-BRANCH-NAME>
```
### 6. Open a Pull Request
You can now create a pull request on the GitHub webpage of your repository. The source branch is `<NEW-BRANCH-NAME>` of your repository and the target branch should be `main` of `hpcaitech/Open-Sora`. After creating this pull request, you should be able to see it [here](https://github.com/hpcaitech/Open-Sora/pulls).
The Open-Sora team will review your code change and merge your code if applicable.
## FAQ
1. `pylint` cannot recognize some members:
Add this into your `settings.json` in VSCode:
```json
"pylint.args": [
"--generated-members=numpy.* ,torch.*,cv2.*",
],
```

26
Dockerfile Normal file
View File

@ -0,0 +1,26 @@
FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0
# metainformation
LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora"
LABEL org.opencontainers.image.licenses = "Apache License 2.0"
LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0"
# install library dependencies
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
# install flash attention
RUN pip install flash-attn --no-build-isolation
# install apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# install xformers
RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121
# Set the working directory
WORKDIR /workspace/Open-Sora
# Copy the current directory contents into the container at /workspace/Open-Sora
COPY . .
# install this project
RUN pip install -v .

350
LICENSE Normal file
View File

@ -0,0 +1,350 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 HPC-AI Technology Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=========================================================================
This project is inspired by the listed projects and is subject to the following licenses:
10. [T5: Text-To-Text Transfer Transformer](https://github.com/google-research/text-to-text-transfer-transformer)
Copyright 2019 Google
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
11. [CLIP](https://github.com/openai/CLIP/tree/main)
MIT License
Copyright (c) 2021 OpenAI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
12. [FLUX](https://github.com/black-forest-labs/flux)
Copyright 2024 Black Forest Labs
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
13. [EfficientViT](https://github.com/mit-han-lab/efficientvit)
Copyright [2023] [Han Cai]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
14. [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/tree/main)
TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
Tencent HunyuanVideo Release Date: December 3, 2024
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. DEFINITIONS.
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
f. “Materials” shall mean, collectively, Tencents proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
i. “Tencent,” “We” or “Us” shall mean THL A29 Limited.
j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo released at [https://github.com/Tencent/HunyuanVideo].
k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
n. “including” shall mean including but not limited to.
2. GRANT OF RIGHTS.
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencents intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
3. DISTRIBUTION.
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
b. You must cause any modified files to carry prominent notices stating that You changed the files;
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2024 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
4. ADDITIONAL COMMERCIAL TERMS.
If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
5. RULES OF USE.
a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
6. INTELLECTUAL PROPERTY.
a. Subject to Tencents ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party's use or distribution of the Tencent Hunyuan Works.
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY'S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
8. SURVIVAL AND TERMINATION.
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
9. GOVERNING LAW AND JURISDICTION.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People's Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People's Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
EXHIBIT A
ACCEPTABLE USE POLICY
Tencent reserves the right to update this Acceptable Use Policy from time to time.
Last modified: November 5, 2024
Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
1. Outside the Territory;
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
3. To harm Yourself or others;
4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
5. To override or circumvent the safety guardrails and safeguards We have put in place;
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
9. To intentionally defame, disparage or otherwise harass others;
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
11. To generate or disseminate personal identifiable information with the purpose of harming others;
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
13. To impersonate another individual without consent, authorization, or legal right;
14. To make high-stakes automated decisions in domains that affect an individual's safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
19. For military purposes;
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.

349
README.md Normal file
View File

@ -0,0 +1,349 @@
<p align="center">
<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/icon.png" width="250"/>
</p>
<div align="center">
<a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
<a href="https://arxiv.org/abs/2503.09642v1"><img src="https://img.shields.io/static/v1?label=Tech Report 2.0&message=Arxiv&color=red"></a>
<a href="https://arxiv.org/abs/2412.20404"><img src="https://img.shields.io/static/v1?label=Tech Report 1.2&message=Arxiv&color=red"></a>
<a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
</div>
<div align="center">
<a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
<a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
<a href="https://x.com/YangYou1991/status/1899973689460044010"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
<a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
</div>
## Open-Sora: Democratizing Efficient Video Production for All
We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model,
tools and all details accessible to all. By embracing **open-source** principles,
Open-Sora not only democratizes access to advanced video generation techniques, but also offers a
streamlined and user-friendly platform that simplifies the complexities of video generation.
With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation.
🎬 For a professional AI video-generation product, try [Video Ocean](https://video-ocean.com/) — powered by a superior model.
<div align="center">
<a href="https://video-ocean.com/">
<img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/3.gif" width="850" />
</a>
</div>
<div align="center">
<a href="https://hpc-ai.com/?utm_source=github&utm_medium=social&utm_campaign=promotion-opensora">
<img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/1.gif" width="850" />
</a>
</div>
<!-- [[中文文档](/docs/zh_CN/README.md)] [[潞晨云](https://cloud.luchentech.com/)|[OpenSora镜像](https://cloud.luchentech.com/doc/docs/image/open-sora/)|[视频教程](https://www.bilibili.com/video/BV1ow4m1e7PX/?vd_source=c6b752764cd36ff0e535a768e35d98d2)] -->
## 📰 News
- **[2025.03.12]** 🔥 We released **Open-Sora 2.0** (11B). 🎬 11B model achieves [on-par performance](#evaluation) with 11B HunyuanVideo & 30B Step-Video on 📐VBench & 📊Human Preference. 🛠️ Fully open-source: checkpoints and training codes for training with only **$200K**. [[report]](https://arxiv.org/abs/2503.09642v1)
- **[2025.02.20]** 🔥 We released **Open-Sora 1.3** (1B). With the upgraded VAE and Transformer architecture, the quality of our generated videos has been greatly improved 🚀. [[checkpoints]](#open-sora-13-model-weights) [[report]](/docs/report_04.md) [[demo]](https://huggingface.co/spaces/hpcai-tech/open-sora)
- **[2024.12.23]** The development cost of video generation models has been cut by 50%! Open-source solutions are now available with H200 GPU vouchers. [[blog]](https://company.hpc-ai.com/blog/the-development-cost-of-video-generation-models-has-saved-by-50-open-source-solutions-are-now-available-with-h200-gpu-vouchers) [[code]](https://github.com/hpcaitech/Open-Sora/blob/main/scripts/train.py) [[vouchers]](https://colossalai.org/zh-Hans/docs/get_started/bonus/)
- **[2024.06.17]** We released **Open-Sora 1.2**, which includes **3D-VAE**, **rectified flow**, and **score condition**. The video quality is greatly improved. [[checkpoints]](#open-sora-12-model-weights) [[report]](/docs/report_03.md) [[arxiv]](https://arxiv.org/abs/2412.20404)
- **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces.
- **[2024.04.25]** We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]](#open-sora-11-model-weights) [[report]](/docs/report_02.md)
- **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation.
Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with
<a href="https://github.com/hpcaitech/ColossalAI"><img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/colossal_ai.png" width="8%" ></a>
acceleration,
inference, and more. Our model can produce 2s 512x512 videos with only 3 days training. [[checkpoints]](#open-sora-10-model-weights)
[[blog]](https://hpc-ai.com/blog/open-sora-v1.0) [[report]](/docs/report_01.md)
- **[2024.03.04]** Open-Sora provides training with 46% cost reduction.
[[blog]](https://hpc-ai.com/blog/open-sora)
📍 Since Open-Sora is under active development, we maintain different branches for different versions. The latest version is [main](https://github.com/hpcaitech/Open-Sora). Old versions include: [v1.0](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.0), [v1.1](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.1), [v1.2](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.2), [v1.3](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.3).
## 🎥 Latest Demo
Demos are presented in compressed GIF format for convenience. For original quality samples and their corresponding prompts, please visit our [Gallery](https://hpcaitech.github.io/Open-Sora/).
| **5s 1024×576** | **5s 576×1024** | **5s 576×1024** |
| -------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0001_1_1.gif" width="">](https://streamable.com/e/8g9y9h?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0160.gif" width="">](https://streamable.com/e/k50mnv?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0017.gif" width="">](https://streamable.com/e/bzrn9n?autoplay=1) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0012_1_1.gif" width="">](https://streamable.com/e/dsv8da?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/douyin_0005.gif" width="">](https://streamable.com/e/3wif07?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0037.gif" width="">](https://streamable.com/e/us2w7h?autoplay=1) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/ft_0055_1_1.gif" width="">](https://streamable.com/e/yfwk8i?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/sora_0019.gif" width="">](https://streamable.com/e/jgjil0?autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/movie_0463.gif" width="">](https://streamable.com/e/lsoai1?autoplay=1) |
<details>
<summary>OpenSora 1.3 Demo</summary>
| **5s 720×1280** | **5s 720×1280** | **5s 720×1280** |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_tomato.gif" width="">](https://streamable.com/e/r0imrp?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_fisherman.gif" width="">](https://streamable.com/e/hfvjkh?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_girl2.gif" width="">](https://streamable.com/e/kutmma?quality=highest&amp;autoplay=1) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_grape.gif" width="">](https://streamable.com/e/osn1la?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_mushroom.gif" width="">](https://streamable.com/e/l1pzws?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_parrot.gif" width="">](https://streamable.com/e/2vqari?quality=highest&amp;autoplay=1) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_trans.gif" width="">](https://streamable.com/e/1in7d6?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_bear.gif" width="">](https://streamable.com/e/e9bi4o?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_futureflower.gif" width="">](https://streamable.com/e/09z7xi?quality=highest&amp;autoplay=1) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_fire.gif" width="">](https://streamable.com/e/16c3hk?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_man.gif" width="">](https://streamable.com/e/wi250w?quality=highest&amp;autoplay=1) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.3/demo_black.gif" width="">](https://streamable.com/e/vw5b64?quality=highest&amp;autoplay=1) |
</details>
<details>
<summary>OpenSora 1.2 Demo</summary>
| **4s 720×1280** | **4s 720×1280** | **4s 720×1280** |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0013.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/7895aab6-ed23-488c-8486-091480c26327) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_1718.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/20f07c7b-182b-4562-bbee-f1df74c86c9a) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0087.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3d897e0d-dc21-453a-b911-b3bda838acc2) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0052.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/644bf938-96ce-44aa-b797-b3c0b513d64c) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_1719.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/272d88ac-4b4a-484d-a665-8d07431671d0) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0002.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ebbac621-c34e-4bb4-9543-1c34f8989764) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0011.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/a1e3a1a3-4abd-45f5-8df2-6cced69da4ca) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0004.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/d6ce9c13-28e1-4dff-9644-cc01f5f11926) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.2/sample_0061.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/561978f8-f1b0-4f4d-ae7b-45bec9001b4a) |
</details>
<details>
<summary>OpenSora 1.1 Demo</summary>
| **2s 240×426** | **2s 240×426** |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16x240x426_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_26.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_27.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x240x426_40.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) |
| **2s 426×240** | **4s 480×854** |
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x426x240_24.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_32x480x854_9.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |
| **16s 320×320** | **16s 224×448** | **2s 426×240** |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16s_320x320.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sample_16s_224x448.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.1/sora_16x426x240_3.gif" width="">](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
</details>
<details>
<summary>OpenSora 1.0 Demo</summary>
| **2s 512×512** | **2s 512×512** | **2s 512×512** |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) |
| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
| [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [<img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v1.0/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) |
| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] |
Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display,
see [here](/assets/texts/t2v_samples.txt) for full prompts.
</details>
## 🔆 Reports
- **[Tech Report of Open-Sora 2.0](https://arxiv.org/abs/2503.09642v1)**
- **[Step by step to train or finetune your own model](docs/train.md)**
- **[Step by step to train and evaluate a video autoencoder](docs/ae.md)**
- **[Explore the high-compression video autoencoder](docs/hcae.md)**
- Reports of previous versions (best viewed in the corresponding branch):
- [Open-Sora 1.3](docs/report_04.md): shift-window attention, unified spatial-temporal VAE, etc.
- [Open-Sora 1.2](docs/report_03.md), [Tech Report](https://arxiv.org/abs/2412.20404): rectified flow, 3d-VAE, score condition, evaluation, etc.
- [Open-Sora 1.1](docs/report_02.md): multi-resolution/length/aspect-ratio, image/video conditioning/editing, data preprocessing, etc.
- [Open-Sora 1.0](docs/report_01.md): architecture, captioning, etc.
📍 Since Open-Sora is under active development, we maintain different branches for different versions. The latest version is [main](https://github.com/hpcaitech/Open-Sora). Old versions include: [v1.0](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.0), [v1.1](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.1), [v1.2](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.2), [v1.3](https://github.com/hpcaitech/Open-Sora/tree/opensora/v1.3).
## Quickstart
### Installation
```bash
# create a virtual env and activate (conda as an example)
conda create -n opensora python=3.10
conda activate opensora
# download the repo
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora
# Ensure torch >= 2.4.0
pip install -v . # for development mode, `pip install -v -e .`
pip install xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu121 # install xformers according to your cuda version
pip install flash-attn --no-build-isolation
```
Optionally, you can install flash attention 3 for faster speed.
```bash
git clone https://github.com/Dao-AILab/flash-attention # 4f0640d5
cd flash-attention/hopper
python setup.py install
```
### Model Download
Our 11B model supports 256px and 768px resolution. Both T2V and I2V are supported by one model. 🤗 [Huggingface](https://huggingface.co/hpcai-tech/Open-Sora-v2) 🤖 [ModelScope](https://modelscope.cn/models/luchentech/Open-Sora-v2).
Download from huggingface:
```bash
pip install "huggingface_hub[cli]"
huggingface-cli download hpcai-tech/Open-Sora-v2 --local-dir ./ckpts
```
Download from ModelScope:
```bash
pip install modelscope
modelscope download hpcai-tech/Open-Sora-v2 --local_dir ./ckpts
```
### Text-to-Video Generation
Our model is optimized for image-to-video generation, but it can also be used for text-to-video generation. To generate high-quality videos, we build a text-to-image-to-video pipeline with the help of the Flux text-to-image model. For 256x256 resolution:
```bash
# Generate one given prompt
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea"
# Save memory with offloading
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --offload True
# Generation with csv
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --dataset.data-path assets/texts/example.csv
```
For 768x768 resolution:
```bash
# One GPU
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px.py --save-dir samples --prompt "raining, sea"
# Multi-GPU with colossalai sp
torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px.py --save-dir samples --prompt "raining, sea"
```
You can adjust the generation aspect ratio with `--aspect_ratio` and the generation length with `--num_frames`. Candidate values for aspect_ratio include `16:9`, `9:16`, `1:1`, `2.39:1`. Candidate values for num_frames should be `4k+1` and less than 129.
You can also run direct text-to-video by:
```bash
# One GPU for 256px
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --prompt "raining, sea"
# Multi-GPU for 768px
torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px.py --prompt "raining, sea"
```
### Image-to-Video Generation
Given a prompt and a reference image, you can generate a video with the following command:
```bash
# 256px
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --cond_type i2v_head --prompt "A plump pig wallows in a muddy pond on a rustic farm, its pink snout poking out as it snorts contentedly. The camera captures the pig's playful splashes, sending ripples through the water under the midday sun. Wooden fences and a red barn stand in the background, framed by rolling green hills. The pig's muddy coat glistens in the sunlight, showcasing the simple pleasures of its carefree life." --ref assets/texts/i2v.png
# 256px with csv
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/256px.py --cond_type i2v_head --dataset.data-path assets/texts/i2v.csv
# Multi-GPU 768px
torchrun --nproc_per_node 8 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px.py --cond_type i2v_head --dataset.data-path assets/texts/i2v.csv
```
## Advanced Usage
### Motion Score
During training, we include a motion score in the text prompt. During inference, you can use the following command to generate videos with a motion score (the default score is 4):
```bash
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --motion-score 4
```
We also provide a dynamic motion score evaluator. After setting your OpenAI API key, you can use the following command to evaluate the motion score of a video:
```bash
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --motion-score dynamic
```
| Score | 1 | 4 | 7 |
| ----- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------- |
| | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_1.gif" width=""> | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_4.gif" width=""> | <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/demo/v2.0/motion_score_7.gif" width=""> |
### Prompt Refine
We take advantage of ChatGPT to refine the prompt. You can use the following command to refine the prompt. The function is available for both text-to-video and image-to-video generation.
```bash
export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
```
### Reproducibility
To make the results reproducible, you can set the random seed by:
```bash
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --sampling_option.seed 42 --seed 42
```
Use `--num-sample k` to generate `k` samples for each prompt.
## Computational Efficiency
We test the computational efficiency of text-to-video on H100/H800 GPU. For 256x256, we use colossalai's tensor parallelism, and `--offload True` is used. For 768x768, we use colossalai's sequence parallelism. All runs use 50 sampling steps. The results are presented in the format: $\color{blue}{\text{Total time (s)}}/\color{red}{\text{peak GPU memory (GB)}}$
| Resolution | 1x GPU | 2x GPUs | 4x GPUs | 8x GPUs |
| ---------- | -------------------------------------- | ------------------------------------- | ------------------------------------- | ------------------------------------- |
| 256x256 | $\color{blue}{60}/\color{red}{52.5}$ | $\color{blue}{40}/\color{red}{44.3}$ | $\color{blue}{34}/\color{red}{44.3}$ | |
| 768x768 | $\color{blue}{1656}/\color{red}{60.3}$ | $\color{blue}{863}/\color{red}{48.3}$ | $\color{blue}{466}/\color{red}{44.3}$ | $\color{blue}{276}/\color{red}{44.3}$ |
## Evaluation
On [VBench](https://huggingface.co/spaces/Vchitect/VBench_Leaderboard), Open-Sora 2.0 significantly narrows the gap with OpenAI's Sora, reducing it from 4.52% → 0.69% compared to Open-Sora 1.2.
![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_vbench.png)
Human preference results show our model is on par with HunyuanVideo 11B and Step-Video 30B.
![Win Rate](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_winrate.png)
With strong performance, Open-Sora 2.0 is cost-effective.
![Cost](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/v2_cost.png)
## Contribution
Thanks goes to these wonderful contributors:
<a href="https://github.com/hpcaitech/Open-Sora/graphs/contributors">
<img src="https://contrib.rocks/image?repo=hpcaitech/Open-Sora" />
</a>
If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md).
## Acknowledgement
Here we only list a few of the projects. For other works and datasets, please refer to our report.
- [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization
system.
- [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
- [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration
strategies for training progress from OpenDiT.
- [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
- [Flux](https://github.com/black-forest-labs/flux): A powerful text-to-image generation model.
- [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
- [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/tree/main?tab=readme-ov-file): Open-Source text-to-video model.
- [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
- [DC-AE](https://github.com/mit-han-lab/efficientvit): Deep Compression AutoEncoder for image compression.
- [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
- [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
- [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
- [PLLaVA](https://github.com/magic-research/PLLaVA): A powerful video captioning model.
- [MiraData](https://github.com/mira-space/MiraData): A large-scale video dataset with long durations and structured caption.
## Citation
```bibtex
@article{opensora,
title={Open-sora: Democratizing efficient video production for all},
author={Zheng, Zangwei and Peng, Xiangyu and Yang, Tianji and Shen, Chenhui and Li, Shenggui and Liu, Hongxin and Zhou, Yukun and Li, Tianyi and You, Yang},
journal={arXiv preprint arXiv:2412.20404},
year={2024}
}
@article{opensora2,
title={Open-Sora 2.0: Training a Commercial-Level Video Generation Model in \$200k},
author={Xiangyu Peng and Zangwei Zheng and Chenhui Shen and Tom Young and Xinying Guo and Binluo Wang and Hang Xu and Hongxin Liu and Mingyan Jiang and Wenjun Li and Yuhui Wang and Anbang Ye and Gang Ren and Qianran Ma and Wanying Liang and Xiang Lian and Xiwen Wu and Yuting Zhong and Zhuangyan Li and Chaoyu Gong and Guojun Lei and Leijun Cheng and Limin Zhang and Minghao Li and Ruijie Zhang and Silan Hu and Shijie Huang and Xiaokang Wang and Yuanheng Zhao and Yuqi Wang and Ziang Wei and Yang You},
year={2025},
journal={arXiv preprint arXiv:2503.09642},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)

9
assets/texts/example.csv Normal file
View File

@ -0,0 +1,9 @@
text
"Imagine a cyberpunk close-up shot capturing the upper body of a character with a melancholic demeanor. The subject is gesturing with one hand while shaking the head, showcasing natural body language. The background features a vibrant carnival, complementing the character's pose. The lighting is dim and moody, emphasizing the contours of their face and upper body. The camera subtly pans or zooms, drawing attention to the harmony between expression, posture, and setting."
"A sleek red sports car speeds through a winding mountain road, its engine roaring against the backdrop of towering snow-capped peaks. The sunlight glints off the polished surface, creating dazzling reflections. The camera pans to capture the lush greenery surrounding the road. The atmosphere is exhilarating, with a cinematic style emphasizing speed and adventure. The lighting is golden, suggesting early morning or late afternoon."
"A group of fluffy baby chicks huddle together under a heat lamp in a rustic barn. Their soft peeping fills the air as they nudge each other for warmth. The wooden floor beneath them is strewn with straw, and the gentle light creates a cozy, heartwarming atmosphere. The video captures their tiny, detailed movements in a close-up, realistic style."
"A black-and-white film of a pianist playing in an empty theater. His fingers move deftly across the keys, the music echoing in the large, empty hall. Dust motes float in the air, caught in the faint light streaming through the high windows. The grand piano gleams under the spotlight, contrasting with the decaying seats and peeling walls. The atmosphere is haunting and nostalgic."
"A wave of glowing steam crashes into a stone wall, the vapor hissing and swirling as it dissipates."
"A tomato surfing on a piece of lettuce down a waterfall of ranch dressing, with exaggerated surfing moves and creamy wave effects to highlight the 3D animated fun."
"A cheerful panda on a bustling city street, casually playing a violin while sitting on a bench. People passing by stop to enjoy the impromptu performance, and a group of children dance around, clapping their hands to the upbeat tempo. The panda's paws move swiftly, creating a lively tune that brings a sense of joy and energy to the urban scene."
"A shimmering, crystalline city built into the side of a massive mountain on a distant planet. Waterfalls of liquid light cascade down the cliffs, with hovering bridges connecting the structures. The entire city glows as it absorbs energy from the planet's core."
1 text
2 Imagine a cyberpunk close-up shot capturing the upper body of a character with an melancholic demeanor. The subject is gesturing with one hand while shaking the head, showcasing natural body language. The background features a vibrant carnival, complementing the character's pose. The lighting is dim and moody, emphasizing the contours of their face and upper body. The camera subtly pans or zooms, drawing attention to the harmony between expression, posture, and setting.
3 A sleek red sports car speeds through a winding mountain road, its engine roaring against the backdrop of towering snow-capped peaks. The sunlight glints off the polished surface, creating dazzling reflections. The camera pans to capture the lush greenery surrounding the road. The atmosphere is exhilarating, with a cinematic style emphasizing speed and adventure. The lighting is golden, suggesting early morning or late afternoon.
4 A group of fluffy baby chicks huddle together under a heat lamp in a rustic barn. Their soft peeping fills the air as they nudge each other for warmth. The wooden floor beneath them is strewn with straw, and the gentle light creates a cozy, heartwarming atmosphere. The video captures their tiny, detailed movements in a close-up, realistic style.
5 A black-and-white film of a pianist playing in an empty theater. His fingers move deftly across the keys, the music echoing in the large, empty hall. Dust motes float in the air, caught in the faint light streaming through the high windows. The grand piano gleams under the spotlight, contrasting with the decaying seats and peeling walls. The atmosphere is haunting and nostalgic.
6 A wave of glowing steam crashes into a stone wall, the vapor hissing and swirling as it dissipates.
7 A tomato surfing on a piece of lettuce down a waterfall of ranch dressing, with exaggerated surfing moves and creamy wave effects to highlight the 3D animated fun.
8 A cheerful panda on a bustling city street, casually playing a violin while sitting on a bench. People passing by stop to enjoy the impromptu performance, and a group of children dance around, clapping their hands to the upbeat tempo. The panda’s paws move swiftly, creating a lively tune that brings a sense of joy and energy to the urban scene.
9 A shimmering, crystalline city built into the side of a massive mountain on a distant planet. Waterfalls of liquid light cascade down the cliffs, with hovering bridges connecting the structures. The entire city glows as it absorbs energy from the planet’s core.

2
assets/texts/i2v.csv Normal file
View File

@ -0,0 +1,2 @@
text,ref
"A plump pig wallows in a muddy pond on a rustic farm, its pink snout poking out as it snorts contentedly. The camera captures the pig's playful splashes, sending ripples through the water under the midday sun. Wooden fences and a red barn stand in the background, framed by rolling green hills. The pig's muddy coat glistens in the sunlight, showcasing the simple pleasures of its carefree life.",assets/texts/i2v.png
1 text ref
2 A plump pig wallows in a muddy pond on a rustic farm, its pink snout poking out as it snorts contentedly. The camera captures the pig's playful splashes, sending ripples through the water under the midday sun. Wooden fences and a red barn stand in the background, framed by rolling green hills. The pig's muddy coat glistens in the sunlight, showcasing the simple pleasures of its carefree life. assets/texts/i2v.png

BIN
assets/texts/i2v.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 746 KiB

49
assets/texts/sora.csv Normal file
View File

@ -0,0 +1,49 @@
text
"A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."
"Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field."
"A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors."
"Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway."
"Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image."
"A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures."
"This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird's head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird's striking appearance."
Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
"A young man at his 20s is sitting on a piece of cloud in the sky, reading a book."
Historical footage of California during the gold rush.
A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
"Extreme close up of a 24 year old woman's eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic"
A cartoon kangaroo disco dances.
"A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera."
A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
"The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery."
"3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest."
"The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it's tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds."
Reflections in the window of a train traveling through the Tokyo suburbs.
"A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography."
"A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect."
"A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds."
"A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer."
Borneo wildlife on the Kinabatangan River
A Chinese Lunar New Year celebration video with Chinese Dragon.
Tour of an art gallery with many beautiful works of art in different styles.
"Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes."
A stop motion animation of a flower growing out of the windowsill of a suburban house.
The story of a robot's life in a cyberpunk setting.
"An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film."
"A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack."
"New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York."
"A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in."
"Step-printing scene of a person running, cinematic film shot in 35mm."
"Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing."
Basketball through hoop then explodes.
"Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care."
"A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood."
The camera directly faces colorful buildings in Burano Italy. An adorable dalmation looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
"An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style."
"This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal's striking appearance."
A corgi vlogging itself in tropical Maui.
"A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat's orange fur. The shot is clear and sharp, with a shallow depth of field."
"Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere."
"Tiltshift of a construction site filled with workers, equipment, and heavy machinery."
"A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth."
A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur.
"The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot."
1 text
2 A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
3 Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
4 A movie trailer featuring the adventures of the 30 year old space man wearing a red wool knitted motorcycle helmet, blue sky, salt desert, cinematic style, shot on 35mm film, vivid colors.
5 Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.
6 Animated scene features a close-up of a short fluffy monster kneeling beside a melting red candle. The art style is 3D and realistic, with a focus on lighting and texture. The mood of the painting is one of wonder and curiosity, as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time. The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image.
7 A gorgeously rendered papercraft world of a coral reef, rife with colorful fish and sea creatures.
8 This close-up shot of a Victoria crowned pigeon showcases its striking blue plumage and red chest. Its crest is made of delicate, lacy feathers, while its eye is a striking red color. The bird's head is tilted slightly to the side, giving the impression of it looking regal and majestic. The background is blurred, drawing attention to the bird's striking appearance.
9 Photorealistic closeup video of two pirate ships battling each other as they sail inside a cup of coffee.
10 A young man at his 20s is sitting on a piece of cloud in the sky, reading a book.
11 Historical footage of California during the gold rush.
12 A close up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
13 Extreme close up of a 24 year old woman's eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70mm, depth of field, vivid colors, cinematic
14 A cartoon kangaroo disco dances.
15 A beautiful homemade video showing the people of Lagos, Nigeria in the year 2056. Shot with a mobile phone camera.
16 A petri dish with a bamboo forest growing within it that has tiny red pandas running around.
17 The camera rotates around a large stack of vintage televisions all showing different programs — 1950s sci-fi movies, horror movies, news, static, a 1970s sitcom, etc, set inside a large New York museum gallery.
18 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.
19 The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a steep mountain slope, dust kicks up from it's tires, the sunlight shines on the SUV as it speeds along the dirt road, casting a warm glow over the scene. The dirt road curves gently into the distance, with no other cars or vehicles in sight. The trees on either side of the road are redwoods, with patches of greenery scattered throughout. The car is seen from the rear following the curve with ease, making it seem as if it is on a rugged drive through the rugged terrain. The dirt road itself is surrounded by steep hills and mountains, with a clear blue sky above with wispy clouds.
20 Reflections in the window of a train traveling through the Tokyo suburbs.
21 A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast, the view showcases historic and magnificent architectural details and tiered pathways and patios, waves are seen crashing against the rocks below as the view overlooks the horizon of the coastal waters and hilly landscapes of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.
22 A large orange octopus is seen resting on the bottom of the ocean floor, blending in with the sandy and rocky terrain. Its tentacles are spread out around its body, and its eyes are closed. The octopus is unaware of a king crab that is crawling towards it from behind a rock, its claws raised and ready to attack. The crab is brown and spiny, with long legs and antennae. The scene is captured from a wide angle, showing the vastness and depth of the ocean. The water is clear and blue, with rays of sunlight filtering through. The shot is sharp and crisp, with a high dynamic range. The octopus and the crab are in focus, while the background is slightly blurred, creating a depth of field effect.
23 A flock of paper airplanes flutters through a dense jungle, weaving around trees as if they were migrating birds.
24 A cat waking up its sleeping owner demanding breakfast. The owner tries to ignore the cat, but the cat tries new tactics and finally the owner pulls out a secret stash of treats from under the pillow to hold the cat off a little longer.
25 Borneo wildlife on the Kinabatangan River
26 A Chinese Lunar New Year celebration video with Chinese Dragon.
27 Tour of an art gallery with many beautiful works of art in different styles.
28 Beautiful, snowy Tokyo city is bustling. The camera moves through the bustling city street, following several people enjoying the beautiful snowy weather and shopping at nearby stalls. Gorgeous sakura petals are flying through the wind along with snowflakes.
29 A stop motion animation of a flower growing out of the windowsill of a suburban house.
30 The story of a robot's life in a cyberpunk setting.
31 An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in thought pondering the history of the universe as he sits at a cafe in Paris, his eyes focus on people offscreen as they walk as he sits mostly motionless, he is dressed in a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses and has a very professorial appearance, and the end he offers a subtle closed-mouth smile as if he found the answer to the mystery of life, the lighting is very cinematic with the golden light and the Parisian streets and city in the background, depth of field, cinematic 35mm film.
32 A beautiful silhouette animation shows a wolf howling at the moon, feeling lonely, until it finds its pack.
33 New York City submerged like Atlantis. Fish, whales, sea turtles and sharks swim through the streets of New York.
34 A litter of golden retriever puppies playing in the snow. Their heads pop out of the snow, covered in.
35 Step-printing scene of a person running, cinematic film shot in 35mm.
36 Five gray wolf pups frolicking and chasing each other around a remote gravel road, surrounded by grass. The pups run and leap, chasing each other, and nipping at each other, playing.
37 Basketball through hoop then explodes.
38 Archeologists discover a generic plastic chair in the desert, excavating and dusting it with great care.
39 A grandmother with neatly combed grey hair stands behind a colorful birthday cake with numerous candles at a wood dining room table, expression is one of pure joy and happiness, with a happy glow in her eye. She leans forward and blows out the candles with a gentle puff, the cake has pink frosting and sprinkles and the candles cease to flicker, the grandmother wears a light blue blouse adorned with floral patterns, several happy friends and family sitting at the table can be seen celebrating, out of focus. The scene is beautifully captured, cinematic, showing a 3/4 view of the grandmother and the dining room. Warm color tones and soft lighting enhance the mood.
40 The camera directly faces colorful buildings in Burano Italy. An adorable dalmation looks through a window on a building on the ground floor. Many people are walking and cycling along the canal streets in front of the buildings.
41 An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush tropical islands, 3D digital render art style.
42 This close-up shot of a chameleon showcases its striking color changing capabilities. The background is blurred, drawing attention to the animal's striking appearance.
43 A corgi vlogging itself in tropical Maui.
44 A white and orange tabby cat is seen happily darting through a dense garden, as if chasing something. Its eyes are wide and happy as it jogs forward, scanning the branches, flowers, and leaves as it walks. The path is narrow as it makes its way between all the plants. the scene is captured from a ground-level angle, following the cat closely, giving a low and intimate perspective. The image is cinematic with warm tones and a grainy texture. The scattered daylight between the leaves and plants above creates a warm contrast, accentuating the cat's orange fur. The shot is clear and sharp, with a shallow depth of field.
45 Aerial view of Santorini during the blue hour, showcasing the stunning architecture of white Cycladic buildings with blue domes. The caldera views are breathtaking, and the lighting creates a beautiful, serene atmosphere.
46 Tiltshift of a construction site filled with workers, equipment, and heavy machinery.
47 A giant, towering cloud in the shape of a man looms over the earth. The cloud man shoots lighting bolts down to the earth.
48 A Samoyed and a Golden Retriever dog are playfully romping through a futuristic neon city at night. The neon lights emitted from the nearby buildings glistens off of their fur.
49 The Glenfinnan Viaduct is a historic railway bridge in Scotland, UK, that crosses over the west highland line between the towns of Mallaig and Fort William. It is a stunning sight as a steam train leaves the bridge, traveling over the arch-covered viaduct. The landscape is dotted with lush greenery and rocky mountains, creating a picturesque backdrop for the train journey. The sky is blue and the sun is shining, making for a beautiful day to explore this majestic spot.

View File

@ -0,0 +1,76 @@
# Inference config for the Open-Sora v2 diffusion model at 256px resolution.
# Consumed by the mmengine-style config loader: every top-level assignment
# becomes a config entry read by scripts/diffusion/inference.py.
save_dir = "samples" # save directory
seed = 42 # random seed (except seed for z)
batch_size = 1
dtype = "bf16"
cond_type = "t2v"
# conditional inference options:
# t2v: text-to-video
# i2v_head: image-to-video (head)
# i2v_tail: image-to-video (tail)
# i2v_loop: connect images
# v2v_head_half: video extension with first half
# v2v_tail_half: video extension with second half
dataset = dict(type="text")
sampling_option = dict(
    resolution="256px", # 256px or 768px
    aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
    num_frames=129, # number of frames
    num_steps=50, # number of steps
    shift=True,
    temporal_reduction=4,
    is_causal_vae=True,
    guidance=7.5, # guidance for text-to-video
    guidance_img=3.0, # guidance for image-to-video
    text_osci=True, # enable text guidance oscillation
    image_osci=True, # enable image guidance oscillation
    scale_temporal_osci=True,
    method="i2v", # hard-coded for now
    seed=None, # random seed for z
)
motion_score = "4" # motion score for video generation
fps_save = 24 # fps for video generation and saving
# Define model components
# Flux-style MMDiT backbone loaded from a local safetensors checkpoint.
model = dict(
    type="flux",
    from_pretrained="./ckpts/Open_Sora_v2.safetensors",
    guidance_embed=False,
    fused_qkv=False,
    use_liger_rope=True,
    # model architecture
    in_channels=64,
    vec_in_dim=768,
    context_in_dim=4096,
    hidden_size=3072,
    mlp_ratio=4.0,
    num_heads=24,
    depth=19,
    depth_single_blocks=38,
    axes_dim=[16, 56, 56],
    theta=10_000,
    qkv_bias=True,
    cond_embed=True,
)
# Video autoencoder used to decode latents back to pixel space.
ae = dict(
    type="hunyuan_vae",
    from_pretrained="./ckpts/hunyuan_vae.safetensors",
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    latent_channels=16,
    use_spatial_tiling=True,
    use_temporal_tiling=False,
)
# Text encoders: T5-XXL for long-form prompt embedding, CLIP for pooled embedding.
t5 = dict(
    type="text_embedder",
    from_pretrained="./ckpts/google/t5-v1_1-xxl",
    max_length=512,
    shardformer=True,
)
clip = dict(
    type="text_embedder",
    from_pretrained="./ckpts/openai/clip-vit-large-patch14",
    max_length=77,
)

View File

@ -0,0 +1,4 @@
# 256px inference with tensor parallelism layered on via the tp plugin config.
_base_ = [ # inherit grammar from mmengine
    "256px.py",
    "plugins/tp.py", # use tensor parallel
]

View File

@ -0,0 +1,8 @@
# 768px inference: inherits the 256px config and overrides only the resolution;
# sequence parallelism is enabled for the longer token sequences.
_base_ = [ # inherit grammar from mmengine
    "256px.py",
    "plugins/sp.py", # use sequence parallel
]
sampling_option = dict(
    resolution="768px",
)

View File

@ -0,0 +1,35 @@
# Inference config for the fast, high-compression model variant: the backbone
# is paired with the Video DC-AE (4x temporal / 32x spatial compression)
# instead of the Hunyuan VAE, so fewer tokens are processed per video.
_base_ = ["t2i2v_768px.py"]
# no need for parallelism
plugin = None
plugin_config = None
plugin_ae = None
plugin_config_ae = None
# model settings
patch_size = 1
model = dict(
    from_pretrained="./ckpts/Open_Sora_v2_Video_DC_AE.safetensors",
    in_channels=128, # matches the 128 latent channels of the DC-AE below
    cond_embed=True,
    patch_size=1,
)
# AE settings
# `_delete_=True` drops the inherited hunyuan_vae definition entirely before
# applying this dict (mmengine inheritance semantics).
ae = dict(
    _delete_=True,
    type="dc_ae",
    from_scratch=True,
    model_name="dc-ae-f32t4c128", # f32 = 32x spatial, t4 = 4x temporal, c128 = 128 channels
    from_pretrained="./ckpts/F32T4C128_AE.safetensors",
    use_spatial_tiling=True,
    use_temporal_tiling=True,
    spatial_tile_size=256,
    temporal_tile_size=32,
    tile_overlap_factor=0.25,
)
ae_spatial_compression = 32
sampling_option = dict(
    num_frames=128, # DC-AE is non-causal: frame count is a multiple of 4, not 4k+1
)

View File

@ -0,0 +1,20 @@
# Parallelism plugin config: ring-attention sequence parallelism (sp_size=8)
# for the diffusion model, tensor parallelism (tp_size=8) for the autoencoder.
plugin = "hybrid"
plugin_config = dict(
    tp_size=1,
    pp_size=1,
    sp_size=8,
    sequence_parallelism_mode="ring_attn",
    enable_sequence_parallelism=True,
    static_graph=True,
    zero_stage=2,
    overlap_allgather=False,
)
plugin_ae = "hybrid"
plugin_config_ae = dict(
    tp_size=8,
    pp_size=1,
    sp_size=1,
    zero_stage=2,
    overlap_allgather=False,
)

View File

@ -0,0 +1,36 @@
# T2I2V pipeline components: a separate Flux image model first generates a
# still image from the text prompt; the video model then animates it.
use_t2i2v = True
# flux configurations
img_flux = dict(
    type="flux",
    from_pretrained="./ckpts/flux1-dev.safetensors",
    guidance_embed=True,
    # model architecture
    in_channels=64,
    vec_in_dim=768,
    context_in_dim=4096,
    hidden_size=3072,
    mlp_ratio=4.0,
    num_heads=24,
    depth=19,
    depth_single_blocks=38,
    axes_dim=[16, 56, 56],
    theta=10_000,
    qkv_bias=True,
    cond_embed=False, # pass i2v & v2v info, for t2v need this layer too but with x_cond and mask all set to 0
)
# 2D autoencoder paired with the image Flux model.
img_flux_ae = dict(
    type="autoencoder_2d",
    from_pretrained="./ckpts/flux1-dev-ae.safetensors",
    resolution=256,
    in_channels=3,
    ch=128,
    out_ch=3,
    ch_mult=[1, 2, 4, 4],
    num_res_blocks=2,
    z_channels=16,
    scale_factor=0.3611,
    shift_factor=0.1159,
)
img_resolution = "768px" # resolution at which the intermediate image is generated

View File

@ -0,0 +1,17 @@
# Parallelism plugin config: 8-way tensor parallelism for both the diffusion
# model and the autoencoder (no pipeline or sequence parallelism).
plugin = "hybrid"
plugin_config = dict(
    tp_size=8,
    pp_size=1,
    sp_size=1,
    zero_stage=2,
    overlap_allgather=False,
)
plugin_ae = "hybrid"
plugin_config_ae = dict(
    tp_size=8,
    pp_size=1,
    sp_size=1,
    zero_stage=2,
    overlap_allgather=False,
)

View File

@ -0,0 +1,4 @@
# 256px inference using the full text-to-image-to-video pipeline.
_base_ = [ # inherit grammar from mmengine
    "256px.py",
    "plugins/t2i2v.py",
]

View File

@ -0,0 +1,4 @@
_base_ = [ # inherit grammer from mmengine
"768px.py",
"plugins/t2i2v.py",
]

View File

@ -0,0 +1,12 @@
# Stage-1 variant that replaces the inherited bucket table with a minimal
# 256px-only set ("_delete_": True removes the base bucket_config first).
# NOTE(review): each tuple appears to be (sampling probability, batch size)
# per frame-count bucket — confirm against the bucket/dataloader builder.
_base_ = ["stage1.py"]
bucket_config = {
    "_delete_": True,
    "256px": {
        1: (1.0, 1),
        33: (1.0, 1),
        97: (1.0, 1),
        129: (1.0, 1),
    },
}

View File

@ -0,0 +1,71 @@
# Training config for the high-compression (Video DC-AE) model variant.
# Inherits the base image-training config and swaps in the DC-AE latent space
# (128 channels, 32x spatial compression, non-causal).
_base_ = ["image.py"]
# 768px buckets only; batch size per bucket shrinks as frame count grows.
# NOTE(review): tuples appear to be (sampling probability, batch size) — confirm.
bucket_config = {
    "_delete_": True,
    "768px": {
        1: (1.0, 20),
        16: (1.0, 8),
        20: (1.0, 8),
        24: (1.0, 8),
        28: (1.0, 8),
        32: (1.0, 8),
        36: (1.0, 4),
        40: (1.0, 4),
        44: (1.0, 4),
        48: (1.0, 4),
        52: (1.0, 4),
        56: (1.0, 4),
        60: (1.0, 4),
        64: (1.0, 4),
        68: (1.0, 3),
        72: (1.0, 3),
        76: (1.0, 3),
        80: (1.0, 3),
        84: (1.0, 3),
        88: (1.0, 3),
        92: (1.0, 3),
        96: (1.0, 3),
        100: (1.0, 2),
        104: (1.0, 2),
        108: (1.0, 2),
        112: (1.0, 2),
        116: (1.0, 2),
        120: (1.0, 2),
        124: (1.0, 2),
        128: (1.0, 2), # 30s
    },
}
# Task-mix weights: text-to-video vs. image-conditioned (first-frame) training.
condition_config = dict(
    t2v=1,
    i2v_head=7,
)
grad_ckpt_settings = (100, 100)
patch_size = 1
model = dict(
    from_pretrained=None,
    grad_ckpt_settings=grad_ckpt_settings,
    in_channels=128, # matches the 128 latent channels of the DC-AE below
    cond_embed=True,
    patch_size=patch_size,
)
# Replace the inherited hunyuan_vae with the Video DC-AE (_delete_=True).
ae = dict(
    _delete_=True,
    type="dc_ae",
    model_name="dc-ae-f32t4c128",
    from_pretrained="./ckpts/F32T4C128_AE.safetensors",
    from_scratch=True,
    scaling_factor=0.493,
    use_spatial_tiling=True,
    use_temporal_tiling=True,
    spatial_tile_size=256,
    temporal_tile_size=32,
    tile_overlap_factor=0.25,
)
is_causal_vae = False
ae_spatial_compression = 32
ckpt_every = 250
lr = 3e-5
optim = dict(lr=lr)

View File

@ -0,0 +1,114 @@
# Base training config (image-heavy pretraining stage). Other stage configs
# inherit from this file and override buckets / learning rate / model fields.
# Dataset settings
dataset = dict(
    type="video_text",
    transform_name="resize_crop",
    fps_max=24, # the desired fps for training
    vmaf=True, # load vmaf scores into text
)
grad_ckpt_settings = (8, 100) # set the grad checkpoint settings
# Per-resolution buckets; image-only training here (frame count 1).
# NOTE(review): tuples appear to be (sampling probability, batch size) — confirm.
bucket_config = {
    "256px": {1: (1.0, 50)},
    "768px": {1: (0.5, 11)},
    "1024px": {1: (0.5, 7)},
}
# Define model components
model = dict(
    type="flux",
    from_pretrained=None,
    strict_load=False,
    guidance_embed=False,
    fused_qkv=False,
    use_liger_rope=True,
    grad_ckpt_settings=grad_ckpt_settings,
    # model architecture
    in_channels=64,
    vec_in_dim=768,
    context_in_dim=4096,
    hidden_size=3072,
    mlp_ratio=4.0,
    num_heads=24,
    depth=19,
    depth_single_blocks=38,
    axes_dim=[16, 56, 56],
    theta=10_000,
    qkv_bias=True,
)
dropout_ratio = { # probability for dropout text embedding
    # 0.31622777 ≈ sqrt(0.1); NOTE(review): presumably chosen so both encoders
    # drop simultaneously with probability ~0.1 — confirm in the training loop.
    "t5": 0.31622777,
    "clip": 0.31622777,
}
ae = dict(
    type="hunyuan_vae",
    from_pretrained="./ckpts/hunyuan_vae.safetensors",
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    latent_channels=16,
    use_spatial_tiling=True,
    use_temporal_tiling=False,
)
is_causal_vae = True
t5 = dict(
    type="text_embedder",
    from_pretrained="google/t5-v1_1-xxl",
    cache_dir="/mnt/ddn/sora/tmp_load/huggingface/hub/",
    max_length=512,
    shardformer=True,
)
clip = dict(
    type="text_embedder",
    from_pretrained="openai/clip-vit-large-patch14",
    cache_dir="/mnt/ddn/sora/tmp_load/huggingface/hub/",
    max_length=77,
)
# Optimization settings
lr = 1e-5
eps = 1e-15
optim = dict(
    cls="HybridAdam",
    lr=lr,
    eps=eps,
    weight_decay=0.0,
    adamw_mode=True,
)
warmup_steps = 0
update_warmup_steps = True
grad_clip = 1.0
accumulation_steps = 1
ema_decay = None # EMA disabled
# Acceleration settings
prefetch_factor = 2
num_workers = 12
num_bucket_build_workers = 64
dtype = "bf16"
plugin = "zero2"
grad_checkpoint = True
plugin_config = dict(
    reduce_bucket_size_in_m=128,
    overlap_allgather=False,
)
# Pre-allocated pinned-memory buffer sizes (bytes): 24 large + 4 small slots.
pin_memory_cache_pre_alloc_numels = [(260 + 20) * 1024 * 1024] * 24 + [
    (34 + 20) * 1024 * 1024
] * 4
async_io = False
# Other settings
seed = 42
outputs = "outputs"
epochs = 1000
log_every = 10
ckpt_every = 100
keep_n_latest = 20
wandb_project = "mmdit"
save_master_weights = True
load_master_weights = True
# For debugging
# record_time = True
# record_barrier = True

View File

@ -0,0 +1,56 @@
# Stage-1 video training config: inherits the image config and replaces the
# buckets with mixed image/video buckets at 256px (plus image-only 768/1024px).
_base_ = ["image.py"]
dataset = dict(memory_efficient=False)
# new config
grad_ckpt_settings = (8, 100)
# Video frame counts follow the causal-VAE pattern 4k+1 (1, 5, 9, ..., 129).
# NOTE(review): tuples appear to be (sampling probability, batch size) — confirm.
bucket_config = {
    "_delete_": True,
    "256px": {
        1: (1.0, 45),
        5: (1.0, 12),
        9: (1.0, 12),
        13: (1.0, 12),
        17: (1.0, 12),
        21: (1.0, 12),
        25: (1.0, 12),
        29: (1.0, 12),
        33: (1.0, 12),
        37: (1.0, 6),
        41: (1.0, 6),
        45: (1.0, 6),
        49: (1.0, 6),
        53: (1.0, 6),
        57: (1.0, 6),
        61: (1.0, 6),
        65: (1.0, 6),
        69: (1.0, 4),
        73: (1.0, 4),
        77: (1.0, 4),
        81: (1.0, 4),
        85: (1.0, 4),
        89: (1.0, 4),
        93: (1.0, 4),
        97: (1.0, 4),
        101: (1.0, 3),
        105: (1.0, 3),
        109: (1.0, 3),
        113: (1.0, 3),
        117: (1.0, 3),
        121: (1.0, 3),
        125: (1.0, 3),
        129: (1.0, 3),
    },
    "768px": {
        1: (0.5, 13),
    },
    "1024px": {
        1: (0.5, 7),
    },
}
model = dict(grad_ckpt_settings=grad_ckpt_settings)
lr = 5e-5
optim = dict(lr=lr)
ckpt_every = 2000
keep_n_latest = 20

View File

@ -0,0 +1,14 @@
# Stage-1 image-conditioning variant: turns on the conditioning embedding and
# mixes in image-to-video tasks alongside text-to-video.
_base_ = ["stage1.py"]
# Define model components
model = dict(cond_embed=True)
# Task-mix weights (relative sampling frequency per conditioning mode).
condition_config = dict(
    t2v=1,
    i2v_head=5, # train i2v (image as first frame) with weight 5
    i2v_loop=1, # train image connection with weight 1
    i2v_tail=1, # train i2v (image as last frame) with weight 1
)
lr = 1e-5
optim = dict(lr=lr)

View File

@ -0,0 +1,94 @@
# Stage-2 training config: adds 768px video buckets and switches to hybrid
# sequence parallelism (ring attention, sp_size=4) to fit longer sequences.
_base_ = ["image.py"]
# new config
grad_ckpt_settings = (100, 100)
plugin = "hybrid"
plugin_config = dict(
    tp_size=1,
    pp_size=1,
    sp_size=4,
    sequence_parallelism_mode="ring_attn",
    enable_sequence_parallelism=True,
    static_graph=True,
    zero_stage=2,
)
# NOTE(review): the 69-frame bucket is absent from the 256px table (65 jumps
# to 73) while 768px has it — confirm whether this gap is intentional.
bucket_config = {
    "_delete_": True,
    "256px": {
        1: (1.0, 130),
        5: (1.0, 14),
        9: (1.0, 14),
        13: (1.0, 14),
        17: (1.0, 14),
        21: (1.0, 14),
        25: (1.0, 14),
        29: (1.0, 14),
        33: (1.0, 14),
        37: (1.0, 10),
        41: (1.0, 10),
        45: (1.0, 10),
        49: (1.0, 10),
        53: (1.0, 10),
        57: (1.0, 10),
        61: (1.0, 10),
        65: (1.0, 10),
        73: (1.0, 7),
        77: (1.0, 7),
        81: (1.0, 7),
        85: (1.0, 7),
        89: (1.0, 7),
        93: (1.0, 7),
        97: (1.0, 7),
        101: (1.0, 6),
        105: (1.0, 6),
        109: (1.0, 6),
        113: (1.0, 6),
        117: (1.0, 6),
        121: (1.0, 6),
        125: (1.0, 6),
        129: (1.0, 6),
    },
    "768px": {
        1: (1.0, 38),
        5: (1.0, 6),
        9: (1.0, 6),
        13: (1.0, 6),
        17: (1.0, 6),
        21: (1.0, 6),
        25: (1.0, 6),
        29: (1.0, 6),
        33: (1.0, 6),
        37: (1.0, 4),
        41: (1.0, 4),
        45: (1.0, 4),
        49: (1.0, 4),
        53: (1.0, 4),
        57: (1.0, 4),
        61: (1.0, 4),
        65: (1.0, 4),
        69: (1.0, 3),
        73: (1.0, 3),
        77: (1.0, 3),
        81: (1.0, 3),
        85: (1.0, 3),
        89: (1.0, 3),
        93: (1.0, 3),
        97: (1.0, 3),
        101: (1.0, 2),
        105: (1.0, 2),
        109: (1.0, 2),
        113: (1.0, 2),
        117: (1.0, 2),
        121: (1.0, 2),
        125: (1.0, 2),
        129: (1.0, 2),
    },
}
model = dict(grad_ckpt_settings=grad_ckpt_settings)
lr = 5e-5
optim = dict(lr=lr)
ckpt_every = 200
keep_n_latest = 20

View File

@ -0,0 +1,87 @@
# Stage-2 image-conditioning variant: enables the conditioning embedding,
# mixes i2v tasks into training, and rebalances bucket batch sizes.
_base_ = ["stage2.py"]
# Define model components
model = dict(cond_embed=True)
grad_ckpt_buffer_size = 25 * 1024**3 # 25 GiB buffer for gradient checkpointing
# Task-mix weights (relative sampling frequency per conditioning mode).
condition_config = dict(
    t2v=1,
    i2v_head=5,
    i2v_loop=1,
    i2v_tail=1,
)
is_causal_vae = True
# NOTE(review): tuples appear to be (sampling probability, batch size) — confirm.
bucket_config = {
    "_delete_": True,
    "256px": {
        1: (1.0, 195),
        5: (1.0, 80),
        9: (1.0, 80),
        13: (1.0, 80),
        17: (1.0, 80),
        21: (1.0, 80),
        25: (1.0, 80),
        29: (1.0, 80),
        33: (1.0, 80),
        37: (1.0, 40),
        41: (1.0, 40),
        45: (1.0, 40),
        49: (1.0, 40),
        53: (1.0, 40),
        57: (1.0, 40),
        61: (1.0, 40),
        65: (1.0, 40),
        69: (1.0, 28),
        73: (1.0, 28),
        77: (1.0, 28),
        81: (1.0, 28),
        85: (1.0, 28),
        89: (1.0, 28),
        93: (1.0, 28),
        97: (1.0, 28),
        101: (1.0, 23),
        105: (1.0, 23),
        109: (1.0, 23),
        113: (1.0, 23),
        117: (1.0, 23),
        121: (1.0, 23),
        125: (1.0, 23),
        129: (1.0, 23),
    },
    "768px": {
        1: (0.5, 38),
        5: (0.5, 10),
        9: (0.5, 10),
        13: (0.5, 10),
        17: (0.5, 10),
        21: (0.5, 10),
        25: (0.5, 10),
        29: (0.5, 10),
        33: (0.5, 10),
        37: (0.5, 5),
        41: (0.5, 5),
        45: (0.5, 5),
        49: (0.5, 5),
        53: (0.5, 5),
        57: (0.5, 5),
        61: (0.5, 5),
        65: (0.5, 5),
        69: (0.5, 3),
        73: (0.5, 3),
        77: (0.5, 3),
        81: (0.5, 3),
        85: (0.5, 3),
        89: (0.5, 3),
        93: (0.5, 3),
        97: (0.5, 3),
        101: (0.5, 2),
        105: (0.5, 2),
        109: (0.5, 2),
        113: (0.5, 2),
        117: (0.5, 2),
        121: (0.5, 2),
        125: (0.5, 2),
        129: (0.5, 2),
    },
}

View File

@ -0,0 +1,33 @@
# VAE reconstruction-inference config for the HunyuanVideo VAE.
# Flat mmengine-style config: every top-level name is a config entry.

# Run settings.
dtype = "bf16"
batch_size = 1
seed = 42
save_dir = "samples/hunyuanvideo_vae"
plugin = "zero2"

# Input data: 45k Pexels clips, resized/cropped, capped at 16 fps.
dataset = {
    "type": "video_text",
    "transform_name": "resize_crop",
    "fps_max": 16,
    "data_path": "datasets/pexels_45k_necessary.csv",
}
# Single bucket: 97 frames at 512px square.
bucket_config = {
    "512px_ar1:1": {97: (1.0, 1)},
}

# Dataloader settings.
num_workers = 24
num_bucket_build_workers = 16
prefetch_factor = 4

# HunyuanVideo VAE with spatial+temporal tiling enabled.
model = {
    "type": "hunyuan_vae",
    "from_pretrained": "./ckpts/hunyuan_vae.safetensors",
    "in_channels": 3,
    "out_channels": 3,
    "layers_per_block": 2,
    "latent_channels": 16,
    "scale_factor": 0.476986,
    "shift_factor": 0,
    "use_spatial_tiling": True,
    "use_temporal_tiling": True,
    "time_compression_ratio": 4,
}

View File

@ -0,0 +1,32 @@
# VAE reconstruction-inference config for the Video DC-AE (f32t4c128).
# Flat mmengine-style config: every top-level name is a config entry.

# Run settings.
dtype = "bf16"
batch_size = 1
seed = 42
save_dir = "samples/video_dc_ae"

# Input data: 45k Pexels clips, resized/cropped, capped at 16 fps.
dataset = {
    "type": "video_text",
    "transform_name": "resize_crop",
    "fps_max": 16,
    "data_path": "datasets/pexels_45k_necessary.csv",
}
# Single bucket: 96 frames (multiple of 4 — the DC-AE is non-causal) at 512px square.
bucket_config = {
    "512px_ar1:1": {96: (1.0, 1)},
}

# Video DC-AE with spatial+temporal tiling enabled.
model = {
    "type": "dc_ae",
    "model_name": "dc-ae-f32t4c128",
    "from_pretrained": "./ckpts/F32T4C128_AE.safetensors",
    "from_scratch": True,
    "use_spatial_tiling": True,
    "use_temporal_tiling": True,
    "spatial_tile_size": 256,
    "temporal_tile_size": 32,
    "tile_overlap_factor": 0.25,
}

# Dataloader settings.
num_workers = 24
num_bucket_build_workers = 16
prefetch_factor = 4

View File

@ -0,0 +1,74 @@
# Training config for the Video DC-AE (from scratch, no discriminator yet).
# ============
# model config
# ============
model = dict(
    type="dc_ae",
    model_name="dc-ae-f32t4c128",
    from_scratch=True,
    from_pretrained=None,
)
# ============
# data config
# ============
dataset = dict(
    type="video_text",
    transform_name="resize_crop",
    data_path="datasets/pexels_45k_necessary.csv",
    fps_max=24,
)
# Single training bucket: 32 frames at 256px square.
bucket_config = {
    "256px_ar1:1": {32: (1.0, 1)},
}
num_bucket_build_workers = 64
num_workers = 12
prefetch_factor = 2
# ============
# train config
# ============
optim = dict(
    cls="HybridAdam",
    lr=5e-5,
    eps=1e-8,
    weight_decay=0.0,
    adamw_mode=True,
    betas=(0.9, 0.98),
)
lr_scheduler = dict(warmup_steps=0)
# Mix images into video training at a 1:4 image:video ratio.
mixed_strategy = "mixed_video_image"
mixed_image_ratio = 0.2 # 1:4
dtype = "bf16"
plugin = "zero2"
plugin_config = dict(
    reduce_bucket_size_in_m=128,
    overlap_allgather=False,
)
grad_clip = 1.0
grad_checkpoint = False
# One 50 MiB pinned-memory slot per worker/prefetch pair.
pin_memory_cache_pre_alloc_numels = [50 * 1024 * 1024] * num_workers * prefetch_factor
seed = 42
outputs = "outputs"
epochs = 100
log_every = 10
ckpt_every = 3000
keep_n_latest = 50
ema_decay = 0.99
wandb_project = "dcae"
update_warmup_steps = True
# ============
# loss config
# ============
# DC-AE is non-variational: reconstruction + perceptual loss only, no KL term.
vae_loss_config = dict(
    perceptual_loss_weight=0.5,
    kl_loss_weight=0,
)

View File

@ -0,0 +1,34 @@
# Training config for the Video DC-AE fine-tuning stage with a 3D patch
# discriminator (GAN loss added on top of the base video_dc_ae.py recipe).
_base_ = ["video_dc_ae.py"]
# 3D convolutional N-layer discriminator, trained from scratch.
discriminator = dict(
    type="N_Layer_discriminator_3D",
    from_pretrained=None,
    input_nc=3,
    n_layers=5,
    conv_cls="conv3d",
)
disc_lr_scheduler = dict(warmup_steps=0)
# Generator-side adversarial loss: active from step 0, weighted 0.05.
gen_loss_config = dict(
    gen_start=0,
    disc_weight=0.05,
)
# Discriminator-side loss: hinge loss, updated from step 0.
disc_loss_config = dict(
    disc_start=0,
    disc_loss_type="hinge",
)
optim_discriminator = dict(
    cls="HybridAdam",
    lr=1e-4,
    eps=1e-8,
    weight_decay=0.0,
    adamw_mode=True,
    betas=(0.9, 0.98),
)
grad_checkpoint = True
model = dict(
    disc_off_grad_ckpt=True,  # set to true if your `grad_checkpoint` is True
)

154
docs/ae.md Normal file
View File

@ -0,0 +1,154 @@
# Step by step to train and evaluate an video autoencoder (AE)
Inspired by [SANA](https://arxiv.org/abs/2410.10629), we aim to drastically increase the compression ratio in the AE. We propose a video autoencoder architecture based on [DC-AE](https://github.com/mit-han-lab/efficientvit), the __Video DC-AE__, which compresses the video by 4x in the temporal dimension and 32x32 in the spatial dimension. Compared to [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)'s VAE of 4x8x8, our proposed AE has a much higher spatial compression ratio.
Thus, we can effectively reduce the token length in the diffusion model by a total of 16x (assuming the same patch sizes), drastically increasing both training and inference speed.
## Data Preparation
Follow this [guide](./train.md#prepare-dataset) to prepare the __DATASET__ for training and inference. You may use our provided dataset or custom ones.
To use custom dataset, pass the argument `--dataset.data_path <your_data_path>` to the following training or inference command.
## Training
We train our __Video DC-AE__ from scratch on 8xGPUs for 3 weeks.
We first train with the following command:
```bash
torchrun --nproc_per_node 8 scripts/vae/train.py configs/vae/train/video_dc_ae.py
```
When the model is almost converged, we add a discriminator and continue to train the model with the checkpoint `model_ckpt` using the following command:
```bash
torchrun --nproc_per_node 8 scripts/vae/train.py configs/vae/train/video_dc_ae_disc.py --model.from_pretrained <model_ckpt>
```
You may pass the flag `--wandb True` if you have a [wandb](https://wandb.ai/home) account and wish to track the training progress online.
## Inference
Download the relevant weights following [this guide](../README.md#model-download). Alternatively, you may use your own trained model by passing the following flag `--model.from_pretrained <your_model_ckpt_path>`.
### Video DC-AE
Use the following code to reconstruct the videos using our trained `Video DC-AE`:
```bash
torchrun --nproc_per_node 1 --standalone scripts/vae/inference.py configs/vae/inference/video_dc_ae.py --save-dir samples/dcae
```
### Hunyuan Video
Alternatively, we have incorporated [HunyuanVideo vae](https://github.com/Tencent/HunyuanVideo) into our code, you may run inference with the following command:
```bash
torchrun --nproc_per_node 1 --standalone scripts/vae/inference.py configs/vae/inference/hunyuanvideo_vae.py --save-dir samples/hunyuanvideo_vae
```
## Config Interpretation
All AE configs are located in `configs/vae/`, divided into configs for training (`configs/vae/train`) and for inference (`configs/vae/inference`).
### Training Config
For training, the same config rules as [those](./train.md#config) for the diffusion model are applied.
<details>
<summary> <b>Loss Config</b> </summary>
Our __Video DC-AE__ is based on the [DC-AE](https://github.com/mit-han-lab/efficientvit) architecture, which doesn't have a variational component. Thus, our training simply consists of the *reconstruction loss* and the *perceptual loss*.
Experimentally, we found that setting a ratio of 0.5 for the perceptual loss is effective.
```python
vae_loss_config = dict(
perceptual_loss_weight=0.5, # weigh the perceptual loss by 0.5
kl_loss_weight=0, # no KL loss
)
```
In a later stage, we include a discriminator, and the training loss for the ae has an additional generator loss component, where we use a small ratio of 0.05 to weigh the loss calculated:
```python
gen_loss_config = dict(
gen_start=0, # include generator loss from step 0 onwards
disc_weight=0.05, # weigh the loss by 0.05
)
```
The discriminator we use is trained from scratch, and its loss is simply the hinge loss:
```python
disc_loss_config = dict(
disc_start=0, # update the discriminator from step 0 onwards
disc_loss_type="hinge", # the discriminator loss type
)
```
</details>
<details>
<summary> <b> Data Bucket Config </b> </summary>
For the data bucket, we used 32 frames of 256px videos to train our AE.
```python
bucket_config = {
"256px_ar1:1": {32: (1.0, 1)},
}
```
</details>
<details>
<summary> <b>Train with more frames or higher resolutions</b> </summary>
If you train with longer frames or larger resolutions, you may increase the `spatial_tile_size` and `temporal_tile_size` during inference without degrading the AE performance (see [Inference Config](ae.md#inference-config)). This may give you advantage of faster AE inference such as when training the diffusion model (although at the cost of slower AE training).
You may increase the video frames to 96 (although any multiple of 4 works, we generally recommend using frame numbers that are multiples of 32):
```python
bucket_config = {
"256px_ar1:1": {96: (1.0, 1)},
}
grad_checkpoint = True
```
or train for higher resolution such as 512px:
```python
bucket_config = {
"512px_ar1:1": {32: (1.0, 1)},
}
grad_checkpoint = True
```
Note that gradient checkpointing needs to be turned on in order to prevent OOM errors.
Moreover, if `grad_checkpointing` is set to `True` in discriminator training, you need to pass the flag `--model.disc_off_grad_ckpt True` or simply set in the config:
```python
grad_checkpoint = True
model = dict(
disc_off_grad_ckpt = True, # set to true if your `grad_checkpoint` is True
)
```
This is to make sure the discriminator loss will have a gradient at the last layer during adaptive loss calculation.
</details>
### Inference Config
For AE inference, we have replicated the tiling mechanism in hunyuan to our Video DC-AE, which can be turned on with the following:
```python
model = dict(
...,
use_spatial_tiling=True,
use_temporal_tiling=True,
spatial_tile_size=256,
temporal_tile_size=32,
tile_overlap_factor=0.25,
...,
)
```
By default, both spatial tiling and temporal tiling are turned on for the best performance.
Since our Video DC-AE is trained on 256px videos of 32 frames only, `spatial_tile_size` should be set to 256 and `temporal_tile_size` should be set to 32.
If you train your own Video DC-AE with other resolutions and length, you may adjust the values accordingly.
You can specify the directory to store output samples with `--save_dir <your_dir>` or setting it in config, for instance:
```python
save_dir = "./samples"
```

38
docs/hcae.md Normal file
View File

@ -0,0 +1,38 @@
# 10× inference speedup with high-compression autoencoder
The high computational cost of training video generation models arises from the
large number of tokens and the dominance of attention computation. To further reduce training expenses,
we explore training video generation models with high-compression autoencoders (Video DC-AEs). As shown in the comparison below, by switching to the Video DC-AE with a much higher downsample ratio (4 x 32 x 32), we can afford to further reduce the patch size to 1 and still achieve __5.2× speedup in training throughput__ and __10x speedup during inference__:
![opensorav2_speed](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/hcae_opensorav2_speed.png)
Nevertheless, despite the advantages of drastically lower computation costs, other challenges remain. For instance, larger latent channel counts slow down convergence. Our generation model adapted with a 128-channel Video DC-AE for 25K iterations achieves a loss level of 0.5, as compared to 0.1 from the initialization model. While the fast video generation model underperforms the original, it still captures spatial-temporal
relationships. We release this model to the research community for further exploration.
Checkout more details in our [report](https://arxiv.org/abs/2503.09642v1).
## Model Download
Download from 🤗 [Huggingface](https://huggingface.co/hpcai-tech/Open-Sora-v2-Video-DC-AE):
```bash
pip install "huggingface_hub[cli]"
huggingface-cli download hpcai-tech/Open-Sora-v2-Video-DC-AE --local-dir ./ckpts
```
## Inference
To inference on our fast video generation model:
```bash
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/high_compression.py --prompt "The story of a robot's life in a cyberpunk setting."
```
## Training
Follow this [guide](./train.md#prepare-dataset) to prepare the __DATASET__ for training.
Then, you may train your own fast generation model with the following command:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/high_compression.py --dataset.data-path datasets/pexels_45k_necessary.csv
```

49
docs/report_01.md Normal file
View File

@ -0,0 +1,49 @@
# Open-Sora 1.0 Report
OpenAI's Sora is amazing at generating one-minute high-quality videos. However, it reveals almost no information about its details. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model.
## Efficiency in choosing the architecture
To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.
The video training involves a large amount of tokens. Considering 24fps 1min videos, we have 1440 frames. With VAE downsampling 4x and patch size downsampling 2x, we have 1440x1024≈1.5M tokens. Full attention on 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost following [Latte](https://github.com/Vchitect/Latte).
As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control a similar number of parameters for these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that with same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) out of efficiency. Speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).
![Architecture Comparison](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch_comp.png)
To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves model's ability of image generation at beginning, while Latte's architecture cannot. The inserted attention increases the number of parameter from 580M to 724M.
![Architecture](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch.jpg)
Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost.
We also try to use a 3D patch embedder in DiT. However, with 2x downsampling on temporal dimension, the generated videos have a low quality. Thus, we leave the downsampling to temporal VAE in our next version. For now, we sample at every 3 frames with 16 frames training and every 2 frames with 64 frames training.
## Data is the key to high quality
We find that the number and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collect 20k relatively high quality videos from [Pexels](https://www.pexels.com/), which provides free license videos. We label the video with LLaVA, an image captioning model, with three frames and a designed prompt. With designed prompt, LLaVA can generate good quality of captions.
![Caption](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_caption.png)
As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version.
## Training Details
With a limited training budgets, we made only a few exploration. We find learning rate 1e-4 is too large and scales down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16` and may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works.
## Loss curves
16x256x256 Pretraining Loss Curve
![16x256x256 Pretraining Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_1.png)
16x256x256 HQ Training Loss Curve
![16x256x256 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_2.png)
16x512x512 HQ Training Loss Curve
![16x512x512 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_3.png)
> Core Contributor: Zangwei Zheng*, Xiangyu Peng*, Shenggui Li, Hongxing Liu, Yang You

117
docs/report_02.md Normal file
View File

@ -0,0 +1,117 @@
# Open-Sora 1.1 Report
- [Model Architecture Modification](#model-architecture-modification)
- [Support for Multi-time/resolution/aspect ratio/fps Training](#support-for-multi-timeresolutionaspect-ratiofps-training)
- [Masked DiT as Image/Video-to-Video Model](#masked-dit-as-imagevideo-to-video-model)
- [Data Collection \& Pipeline](#data-collection--pipeline)
- [Training Details](#training-details)
- [Limitation and Future Work](#limitation-and-future-work)
In the Open-Sora 1.1 release, we train a 700M model on 10M data (Open-Sora 1.0 was trained on 400K data) with a better STDiT architecture. We implement the following features mentioned in [sora's report](https://openai.com/research/video-generation-models-as-world-simulators):
- Variable durations, resolutions, aspect ratios (Sampling flexibility, Improved framing and composition)
- Prompting with images and videos (Animating images, Extending generated videos, Video-to-video editing, Connecting videos)
- Image generation capabilities
To achieve this goal, we use multi-task learning in the pretraining stage. For diffusion models, training with different sampled timesteps is already a form of multi-task learning. We further extend this idea to multi-resolution, aspect ratio, frame length, fps, and different mask strategies for image- and video-conditioned generation. We train the model on **0s~15s, 144p to 720p, various aspect ratios** videos. Although the quality of time consistency is not that high due to limited training FLOPs, we can still see the potential of the model.
## Model Architecture Modification
We made the following modifications to the original ST-DiT for better training stability and performance (ST-DiT-2):
- **[Rope embedding](https://arxiv.org/abs/2104.09864) for temporal attention**: Following LLM's best practice, we change the sinusoidal positional encoding to rope embedding for temporal attention since it is also a sequence prediction task.
- **AdaIN and Layernorm for temporal attention**: we wrap the temporal attention with AdaIN and layernorm as the spatial attention to stabilize the training.
- **[QK-normalization](https://arxiv.org/abs/2302.05442) with [RMSNorm](https://arxiv.org/abs/1910.07467)**: Following [SD3](https://arxiv.org/pdf/2403.03206.pdf), we apply QK-normalization to all attention layers for better training stability in half-precision.
- **Dynamic input size support and video information conditioning**: To support multi-resolution, aspect ratio, and fps training, we make ST-DiT-2 accept any input size and automatically scale positional embeddings. Extending [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)'s idea, we condition on the video's height, width, aspect ratio, frame length, and fps.
- **Extending T5 tokens from 120 to 200**: our caption is usually less than 200 tokens, and we find the model can handle longer text well.
## Support for Multi-time/resolution/aspect ratio/fps Training
As mentioned in the [sora's report](https://openai.com/research/video-generation-models-as-world-simulators), training with original video's resolution, aspect ratio, and length increase sampling flexibility and improve framing and composition. We found three ways to achieve this goal:
- [NaViT](https://arxiv.org/abs/2307.06304): support dynamic size within the same batch by masking, with little efficiency loss. However, the system is a bit complex to implement, and may not benefit from optimized kernels such as flash attention.
- Padding ([FiT](https://arxiv.org/abs/2402.12376), [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)): support dynamic size within the same batch by padding. However, padding different resolutions to the same size is not efficient.
- Bucket ([SDXL](https://arxiv.org/abs/2307.01952), [PixArt](https://arxiv.org/abs/2310.00426)): support dynamic size in different batches by bucketing, but the size must be the same within the same batch, and only a fixed number of size can be applied. With the same size in a batch, we do not need to implement complex masking or padding.
For the simplicity of implementation, we choose the bucket method. We pre-define some fixed resolution, and allocate different samples to different bucket. The concern for bucketing is listed below. But we can see that the concern is not a big issue in our case.
<details>
<summary>View the concerns</summary>
- The bucket size is limited to a fixed number: First, in real-world applications, only a few aspect ratios (9:16, 3:4) and resolutions (240p, 1080p) are commonly used. Second, we find trained models can generalize well to unseen resolutions.
- The size in each batch is the same, which breaks the i.i.d. assumption: Since we are using multiple GPUs, the local batches on different GPUs have different sizes. We did not see a significant performance drop due to this issue.
- There may not be enough samples to fill each bucket, and the distribution may be biased: First, our dataset is large enough to fill each bucket when the local batch size is not too large. Second, we should analyze the data's distribution on sizes and define the bucket sizes accordingly. Third, an unbalanced distribution did not affect the training process significantly.
- Different resolutions and frame lengths may have different processing speed: Different from PixArt, which only deals with aspect ratios of similar resolutions (similar token numbers), we need to consider the processing speed of different resolutions and frame lengths. We can use the `bucket_config` to define the batch size for each bucket to ensure the processing speed is similar.
</details>
![bucket](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_bucket.png)
As shown in the figure, a bucket is a triplet of `(resolution, num_frame, aspect_ratio)`. We provide pre-defined aspect ratios for different resolution that covers most of the common video aspect ratios. Before each epoch, we shuffle the dataset and allocate the samples to different buckets as shown in the figure. We put a sample into a bucket with largest resolution and frame length that is smaller than the video's.
Considering our computational resource is limited, we further introduce two attributes `keep_prob` and `batch_size` for each `(resolution, num_frame)` to reduce the computational cost and enable multi-stage training. Specifically, a high-resolution video will be downsampled to a lower resolution with probability `1-keep_prob`, and the batch size for each bucket is `batch_size`. In this way, we can control the number of samples in different buckets and balance the GPU load by searching for a good batch size for each bucket.
A detailed explanation of the bucket usage in training is available in [docs/config.md](/docs/config.md#training-bucket-configs).
## Masked DiT as Image/Video-to-Video Model
Transformers can be easily extended to support image-to-image and video-to-video tasks. We propose a mask strategy to support image and video conditioning. The mask strategy is shown in the figure below.
![mask strategy](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask.png)
Typically, we unmask the frames to be conditioned on for image/video-to-video conditioning. During the ST-DiT forward pass, unmasked frames will have timestep 0, while others remain the same (t). We find that directly applying this strategy to a trained model yields poor results, as the diffusion model did not learn to handle different timesteps in one sample during training.
Inspired by [UL2](https://arxiv.org/abs/2205.05131), we introduce random mask strategy during training. Specifically, we randomly unmask the frames during training, including unmask the first frame, the first k frames, the last frame, the last k frames, the first and last k frames, random frames, etc. Based on Open-Sora 1.0, with 50% probability of applying masking, we see the model can learn to handle image conditioning (while 30% yields worse ability) for 10k steps, with a little text-to-video performance drop. Thus, for Open-Sora 1.1, we pretrain the model from scratch with masking strategy.
An illustration of the masking strategy config to use in inference is given as follows. A five-number tuple provides great flexibility in defining the mask strategy. By conditioning on generated frames, we can autoregressively generate infinite frames (although errors propagate).
![mask strategy config](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask_config.png)
A detailed explanation of the mask strategy usage is available in [docs/config.md](/docs/config.md#advanced-inference-config).
## Data Collection & Pipeline
As we found in Open-Sora 1.0 that the number and quality of data are crucial for training a good model, we work hard on scaling the dataset. First, we create an automatic pipeline following [SVD](https://arxiv.org/abs/2311.15127), including scene cutting, captioning, various scoring and filtering, and dataset management scripts and conventions. More information can be found in [docs/data_processing.md](/docs/data_processing.md).
![pipeline](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_data_pipeline.png)
We plan to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk IO a bottleneck for training and data processing at the same time. Thus, we could only prepare a 10M dataset and did not go through the whole processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below. More information about the dataset can be found in [docs/datasets.md](/docs/datasets.md).
Image text tokens (by T5 tokenizer):
![image text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_image_textlen.png)
Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens.
![video text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_textlen.png)
Video duration:
![video duration](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_duration.png)
## Training Details
With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied.
1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py). The video looking is good, but the model does not know much about the temporal knowledge. We use mask ratio of 10%.
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). We use a lower resolution as we find in Open-Sora 1.0 that the model can learn temporal knowledge with relatively low resolution.
4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. Similar to SD3, we find the model quickly adapt to the QK-normalization. We also switch iddpm-speed to iddpm, and increase the mask ratio to 25% as we find image-condition not learning well. We trained for **17k** steps for **14 hours**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step **81k**.
5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data.
6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **4k** with **one day** on high-quality data. We find loading previous stage's optimizer state can help the model learn faster.
To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on 64 H800 GPUs.
## Limitation and Future Work
As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work.
- **Generation Failure**: we find many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version.
- **Noisy generation and low fluency**: we find the generated videos are sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version.
- **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency.
- **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation.
- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which is not conducted due to IO bottleneck. We plan to filter the data by aesthetic score and finetuning the model to improve the aesthetic score.
- **Worse quality for longer video generation**: we find with a same prompt, the longer video has worse quality. This means the image quality is not equally adapted to different lengths of sequences.
> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou, Tianyi Li
> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu

160
docs/report_03.md Normal file
View File

@ -0,0 +1,160 @@
# Open-Sora 1.2 Report
- [Video compression network](#video-compression-network)
- [Rectified flow and model adaptation](#rectified-flow-and-model-adaptation)
- [More data and better multi-stage training](#more-data-and-better-multi-stage-training)
- [Easy and effective model conditioning](#easy-and-effective-model-conditioning)
- [Evaluation](#evaluation)
- [Sequence parallelism](#sequence-parallelism)
In the Open-Sora 1.2 release, we train a 1.1B model on >30M data (\~80k hours), with a training cost of 35k H100 GPU hours, supporting 0s\~16s, 144p to 720p, various aspect ratios video generation. Our configurations are listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
| | image | 2s | 4s | 8s | 16s |
| ---- | ----- | --- | --- | --- | --- |
| 240p | ✅ | ✅ | ✅ | ✅ | ✅ |
| 360p | ✅ | ✅ | ✅ | ✅ | ✅ |
| 480p | ✅ | ✅ | ✅ | ✅ | 🆗 |
| 720p | ✅ | ✅ | ✅ | 🆗 | 🆗 |
Here ✅ means that the data is seen during training, and 🆗 means although not trained, the model can inference at that config. Inference for 🆗 requires more than one 80G memory GPU and sequence parallelism.
Besides features introduced in Open-Sora 1.1, Open-Sora 1.2 highlights:
- Video compression network
- Rectified-flow training
- More data and better multi-stage training
- Easy and effective model conditioning
- Better evaluation metrics
All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.2 release. The following sections will introduce the details of the improvements. We also refine our codebase and documentation to make it easier to use and develop, and add a LLM to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages.
## Video compression network
For Open-Sora 1.0 & 1.1, we used stability-ai's 83M 2D VAE, which compress the video only in the spatial dimension by 8x8 times. To reduce the temporal dimension, we extracted one frame in every three frames. However, this method led to the low fluency of generated video as the generated fps is sacrificed. Thus, in this release, we introduce the video compression network as OpenAI's Sora does. With a 4 times compression in the temporal dimension, we do not need to extract frames and can generate videos with the original fps.
Considering the high computational cost of training a 3D VAE, we hope to re-use the knowledge learnt in the 2D VAE. We notice that after 2D VAE's compression, the features adjacent in the temporal dimension are still highly correlated. Thus, we propose a simple video compression network, which first compress the video in the spatial dimension by 8x8 times, then compress the video in the temporal dimension by 4x times. The network is shown below:
![video_compression_network](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_3d_vae.png)
We initialize the 2D VAE with [SDXL's VAE](https://huggingface.co/stabilityai/sdxl-vae), which is better than our previously used one. For the 3D VAE, we adopt the structure of VAE in [Magvit-v2](https://magvit.cs.cmu.edu/v2/), which contains 300M parameters. Along with 83M 2D VAE, the total parameters of the video compression network is 384M. We train the 3D VAE for 1.2M steps with local batch size 1. The training data is videos from pixels and pixabay, and the training video size is mainly 17 frames, 256x256 resolution. Causal convolutions are used in the 3D VAE to make the image reconstruction more accurate.
Our training involves three stages:
1. For the first 380k steps, we train on 8 GPUs and freeze the 2D VAE. The training objective includes the reconstruction of the compressed features from 2D VAE (pink one in the figure) and also add a loss to make features from the 3D VAE similar to the features from the 2D VAE (pink one and green one, called identity loss). We find the latter loss can quickly make the whole VAE achieve a good performance for image and much faster to converge in the next stage.
2. For the next 260k steps, We remove the identity loss and just learn the 3D VAE.
3. For the last 540k steps, since we find that only reconstructing the 2D VAE's features cannot lead to further improvement, we remove that loss and train the whole VAE to reconstruct the original videos. This stage is trained on 24 GPUs.
For both stage 1 and stage 2 training, we adopt 20% images and 80% videos. Following [Magvit-v2](https://magvit.cs.cmu.edu/v2/), we train videos using 17 frames, while zero-padding the first 16 frames for images. However, we find that this setting leads to blurring of videos with lengths different from 17 frames. Thus, in stage 3, we use a random number within 34 frames for mixed video length training (i.e., zero-pad the first `34-n` frames if we want to train an `n`-frame video), to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release.
When using the VAE for the diffusion model, our stacked VAE requires little memory as our VAE's input is already compressed. We also split the input videos into several 17-frame clips to make the inference more efficient. The performance of our VAE is on par with another open-sourced 3D VAE in [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md).
| Model | SSIM↑ | PSNR↑ |
| ------------------ | ----- | ------ |
| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
| Open-Sora 1.2 | 0.880 | 30.590 |
## Rectified flow and model adaptation
The latest diffusion models like Stable Diffusion 3 adopt [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. Unfortunately, SD3's rectified flow training code is not open-sourced. However, Open-Sora 1.2 provides the training code following SD3's paper, including:
- Basic rectified flow training ([original rectified flow paper](https://arxiv.org/abs/2209.03003))
- Logit-norm sampling for training acceleration ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 3.1, intuitively it is more likely to sample timesteps at middle noise level)
- Resolution and video length aware timestep sampling ([SD3 paper](https://arxiv.org/pdf/2403.03206) Section 5.3.2, intuitively it is more likely to sample timesteps with more noise for larger resolution, and we extend it to longer video)
For the resolution-aware timestep sampling, we should use more noise for images with larger resolution. We extend this idea to video generation and use more noise for videos with longer length.
Open-Sora 1.2 starts from the [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) checkpoint. Note that this model is trained with DDPM and SDXL VAE, also a much higher resolution. We find finetuning on a small dataset can easily adapt the model for our video generation setting. The adaptation process is as follows, all training is done on 8 GPUs (the adaptation for the diffusion model is quite fast and straightforward):
1. Multi-resolution image generation ability: we train the model to generate different resolution ranging from 144p to 2K for 20k steps.
2. QK-norm: we add the QK-norm to the model and train for 18k steps.
3. Rectified flow: we transform from discrete-time DDPM to continuous-time rectified flow and train for 10k steps.
4. Rectified flow with logit-norm sampling and resolution-aware timestep sampling: we train for 33k steps.
5. Smaller AdamW epsilon: following SD3, with QK-norm, we can use a smaller epsilon (1e-15) for AdamW, we train for 8k steps.
6. New VAE and fps conditioning: we replace the original VAE with ours and add fps conditioning to the timestep conditioning, we train for 25k steps. Note that normalizing each channel is important for rectified flow training.
7. Temporal attention blocks: we add temporal attention blocks with zero initialized projection layers. We train on images for 3k steps.
8. Temporal blocks only for video with mask strategy: we train the temporal attention blocks only on videos for 38k steps.
After the above adaptation, we are ready to train the model on videos. The adaptation above maintains the original model's ability to generate high-quality images, and brings multiple benefits for video generation:
- With rectified flow, we can accelerate the training and reduce the number of sampling steps for video from 100 to 30, which greatly reduces the waiting time for inference.
- With QK-norm, the training is more stable and an aggressive optimizer can be used.
- With new VAE, the temporal dimension is compressed by 4 times, which makes the training more efficient.
- With multi-resolution image generation ability, the model can generate videos with different resolutions.
## More data and better multi-stage training
Due to a limited computational budget, we carefully arrange the training data from low to high quality and split our training into three stages. Our training involves 12x8 GPUs, and the total training time is about 2 weeks for about 70k steps.
### First stage
We first train the model on Webvid-10M datasets (40k hours) for 30k steps (2 epochs). Since the video is all lower than 360p resolution and contains watermark, we train on this dataset first. The training mainly happens on 240p and 360p, with video length 2s~16s. We use the original caption in the dataset for training. The training config locates in [stage1.py](/configs/opensora-v1-2/train/stage1.py).
### Second stage
Then we train the model on Panda-70M datasets. This dataset is large but the quality varies. We use the official 30M subset which clips are more diverse, and filter out videos with aesthetic score lower than 4.5. This leads to a 20M subset with 41k hours. The captions in the dataset are directly used for our training. The training config locates in [stage2.py](/configs/opensora-v1-2/train/stage2.py).
The training mainly happens on 360p and 480p. We train the model for 23k steps, which is 0.5 epoch. The training is not fully done since we hope our new model can meet you earlier.
### Third stage
In this stage, we collect ~2M video clips with a total length of 5K hours from all kinds of sources, including:
- Free-license videos, sourced from Pexels, Pixabay, Mixkit, etc.
- [MiraData](https://github.com/mira-space/MiraData): a high-quality dataset with long videos, mainly from games and city/scenic exploration.
- [Vript](https://github.com/mutonix/Vript/tree/main): a densely annotated dataset.
- And some other datasets.
While MiraData and Vript have captions from GPT, we use [PLLaVA](https://github.com/magic-research/PLLaVA) to caption the rest ones. Compared with LLaVA, which is only capable of single frame/image captioning, PLLaVA is specially designed and trained for video captioning. The [accelerated PLLaVA](/tools/caption/README.md#pllava-captioning) is released in our `tools/`. In practice, we use the pretrained PLLaVA 13B model and select 4 frames from each video for captioning with a spatial pooling shape of 2*2.
Some statistics of the video data used in this stage are shown below. We present basic statistics of duration and resolution, as well as aesthetic score and optical flow score distribution.
We also extract tags for objects and actions from video captions and count their frequencies.
![stats](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_video_stats.png)
![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_objects_count.png)
![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_actions_count.png)
We mainly train 720p and 1080p videos in this stage, aiming to extend the model's ability to larger resolutions. We use a mask ratio of 25% during training. The training config locates in [stage3.py](/configs/opensora-v1-2/train/stage3.py). We train the model for 15k steps, which is approximately 2 epochs.
## Easy and effective model conditioning
For stage 3, we calculate the aesthetic score and motion score for each video clip. However, since the number of video clips is small, we are not willing to filter out clips with low scores, which leads to a smaller dataset. Instead, we append the scores to the captions and use them as conditioning. We find this method can make model aware of the scores and follows the scores to generate videos with better quality.
For example, a video with aesthetic score 5.5, motion score 10, and a detected camera motion pan left, the caption will be:
```plaintext
[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
```
During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools.
## Evaluation
Previously, we monitored the training process only by human evaluation, as DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find the training loss is well correlated with the quality of generated videos, as stated in SD3. Thus, we keep track of the rectified flow evaluation loss on 100 images and 1k videos.
We sampled 1k videos from pixabay as validation dataset. We calculate the evaluation loss for image and different lengths of videos (2s, 4s, 8s, 16s) for different resolution (144p, 240p, 360p, 480p, 720p). For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged. We also provide a [video](https://streamable.com/oqkkf1) showing the sampled videos with a fixed prompt for different steps.
![Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_val_loss.png)
![Video Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vid_val_loss.png)
In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the VBench score with 240p 2s videos. The two metrics verify that our model continues to improve during training.
![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vbench_score.png)
All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details.
| Model | Total Score | Quality Score | Semantic Score |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91% | 78.81% | 64.28% |
| Open-Sora V1.2 | 79.23% | 80.71% | 73.30% |
## Sequence parallelism
We use sequence parallelism to support long-sequence training and inference. Our implementation is based on Ulysses and the workflow is shown below. When sequence parallelism is enabled, we only need to apply the `all-to-all` communication to the spatial block in STDiT as only spatial computation is dependent on the sequence dimension.
![SP](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/sequence_parallelism.jpeg)
Currently, we have not used sequence parallelism for training as the data resolution is small, and we plan to do so in the next release. As for inference, we can use sequence parallelism in case your GPU goes out of memory. A simple benchmark shows that sequence parallelism can achieve a speedup.
| Resolution | Seconds | Number of GPUs | Enable SP | Time taken/s | Speedup per GPU |
| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
| 720p | 16s | 1 | No | 547.97 | - |
| 720p | 16s | 2 | Yes | 244.38 | 12% |

116
docs/report_04.md Normal file
View File

@ -0,0 +1,116 @@
# Open-Sora 1.3 Report
- [Video compression network](#video-compression-network)
- [Upgraded STDiT with shifted-window attention](#upgraded-stdit-with-shifted-window-attention)
- [Easy and effective model conditioning](#easy-and-effective-model-conditioning)
- [Evaluation](#evaluation)
In the Open-Sora 1.3 release, we train a 1.1B model on >60M data (\~85k hours), with a training cost of 35k H100 GPU hours, supporting 0\~113 frames, 360p & 720p, and various aspect ratios for video generation. Our configuration is listed below. Following our 1.2 version, Open-Sora 1.3 can also do image-to-video generation and video extension.
| | image | 49 frames | 65 frames | 81 frames | 97 frames | 113 frames |
| ---- | ----- | ---------- | ---------- | ---------- | --------- | ---------- |
| 360p | ✅ | ✅ | ✅ | ✅ | ✅ |✅ |
| 720p | ✅ | ✅ | ✅ | ✅ | ✅ |✅ |
Here ✅ means that the data is seen during training.
Besides features introduced in Open-Sora 1.2, Open-Sora 1.3 highlights:
- Video compression network
- Upgraded STDiT with shifted-window attention
- More data and better multi-stage training
- Easy and effective model conditioning
- Better evaluation metrics
All implementations (both training and inference) of the above improvements are available in the Open-Sora 1.3 release. The following sections will introduce the details of the improvements. We also refine our codebase and documentation to make it easier to use and develop, and add a LLM refiner to [refine input prompts](/README.md#gpt-4o-prompt-refinement) and support more languages.
## Video compression network
In Open-Sora 1.2, the video compression architecture employed a modular approach, where spatial and temporal dimensions were handled separately. The spatial VAE, based on Stability AI's SDXL VAE, compressed individual frames along the spatial dimensions. The temporal VAE then processed the latent representations from the spatial VAE to handle temporal compression. This two-stage design allowed effective spatial and temporal compression but introduced limitations. These included inefficiencies in handling long videos due to fixed-length input frames, a lack of seamless integration between spatial and temporal features, and higher memory requirements during both training and inference.
Open-Sora 1.3 introduces a unified approach to video compression. By combining spatial and temporal processing into a single framework and leveraging advanced features like tiled 3D convolutions and dynamic frame support, Open-Sora 1.3 achieves better efficiency, scalability, and reconstruction quality. Here are the key improvements in Open-Sora 1.3 VAE:
**1. Unified Spatial-Temporal Processing:** Instead of using separate VAEs for spatial and temporal compression, Open-Sora 1.3 adopts a single encoder-decoder structure that simultaneously handles both dimensions. This approach eliminates the need for intermediate representations and redundant data transfers between spatial and temporal modules.
**2. Tiled 3D Convolutions:** Open-Sora 1.3 incorporates tiled 3D convolution support for the temporal dimension. By breaking down videos into smaller temporal tiles, this feature enables efficient encoding and decoding of longer video sequences without increasing memory overhead. This improvement addresses the limitations of Open-Sora 1.2 in handling large frame counts and ensures higher flexibility in temporal compression.
**3. Dynamic Micro-Batch and Micro-Frame Processing:** Open-Sora 1.3 introduces a new micro-batch and micro-frame processing mechanism. This allows for: (1) Adaptive temporal overlap: Overlapping frames during temporal encoding and decoding help reduce discontinuities at tile boundaries. (2) Dynamic frame size support: Instead of being restricted to fixed-length sequences (e.g., 17 frames in Open-Sora 1.2), Open-Sora 1.3 supports dynamic sequence lengths, making it robust for varied video lengths.
**4. Unified Normalization Mechanism:** The normalization process in Open-Sora 1.3 has been refined with tunable scaling (scale) and shifting (shift) parameters that ensure consistent latent space distributions across diverse datasets. Unlike Open-Sora 1.2, where normalization was specific to fixed datasets, this version introduces more generalized parameters and support for frame-specific normalization strategies.
#### Summary of Improvements
| Feature | Open-Sora 1.2 | Open-Sora 1.3 |
|------------------------|-----------------------------------------|-----------------------------------------|
| **Architecture** | Separate spatial and temporal VAEs | Unified spatial-temporal VAE |
| **Tiled Processing** | Not supported | Supported (Tiled 3D Convolutions) |
| **Frame Length Support**| Fixed (17 frames) | Dynamic frame support with overlap |
| **Normalization** | Fixed parameters | Tunable scaling and shifting |
## Upgraded STDiT with shifted-window attention
Following the success of OpenSora 1.2, version 1.3 introduces several architectural improvements and new capabilities to enhance video generation quality and flexibility. This section outlines the key improvements and differences between these two versions.
Latest diffusion models like Stable Diffusion 3 adopt the [rectified flow](https://github.com/gnobitab/RectifiedFlow) instead of DDPM for better performance. While SD3's rectified flow training code is not open-sourced, OpenSora provides the training code following SD3's paper. OpenSora 1.2 introduced several key strategies from SD3:
1. Basic rectified flow training, which enables continuous-time diffusion
2. Logit-norm sampling for training acceleration (following SD3 paper Section 3.1), preferentially sampling timesteps at middle noise levels
3. Resolution and video length aware timestep sampling (following SD3 paper Section 5.3.2), using more noise for larger resolutions and longer videos
For OpenSora 1.3, we further enhance the model with significant improvements in architecture, capabilities, and performance:
#### 1. Shift-Window Attention Mechanism
- Introduced kernel-based local attention with configurable kernel_size for efficient computation
- Implemented shift-window partitioning strategy similar to Swin Transformer
- Added padding mask handling for window boundaries with extra_pad_on_dims support
- Extended position encoding with 3D relative positions within local windows (temporal, height, width)
#### 2. Enhanced Position Encoding
- Improved RoPE implementation with reduced rotation_dim (1/3 of original) for 3D scenarios
- Added separate rotary embeddings for temporal, height, and width dimensions
- Implemented resolution-adaptive scaling for position encodings
- Optional spatial RoPE for better spatial relationship modeling
#### 3. Flexible Generation
- Added I2V and V2V capabilities with dedicated conditioning mechanisms
- Introduced conditional embedding modules (x_embedder_cond and x_embedder_cond_mask)
- Zero-initialized condition embeddings for stable training
- Flexible temporal modeling with skip_temporal option
#### 4. Performance Optimization
- Refined Flash Attention triggering conditions (N > 128) for better efficiency
- Added support for torch.scaled_dot_product_attention (SDPA) as an alternative backend
- Optimized memory usage through improved padding and window partitioning
- Enhanced sequence parallelism with adaptive height padding
The adaptation process from [PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) remains similar but with additional steps:
1-7. [Same as v1.2: multi-resolution training, QK-norm, rectified flow, logit-norm sampling, smaller AdamW epsilon, new VAE, and basic temporal attention]
#### 8. Enhanced temporal blocks
- Added kernel-based local attention with shift-window support
- Implemented 3D relative position encoding with resolution-adaptive scaling
- Zero-initialized projection layers with improved initialization strategy
Compared to v1.2 which focused on basic video generation, v1.3 brings substantial improvements in three key areas: **1. Quality**: Enhanced spatial-temporal modeling through shift-window attention and 3D position encoding. **2. Flexibility**: Support for I2V/V2V tasks and configurable temporal modeling. **3. Efficiency**: Optimized attention computation and memory usage
These improvements maintain backward compatibility with v1.2's core features while extending the model's capabilities for real-world applications. The model retains its ability to generate high-quality images and videos using rectified flow, while gaining new strengths in conditional generation and long sequence modeling.
## Easy and effective model conditioning
We calculate the aesthetic score and motion score for each video clip, and filter out those clips with low scores, which leads to a dataset with better video quality. Additionally, we append the scores to the captions and use them as conditioning. Specifically, we convert numerical scores into descriptive language based on predefined ranges. The aesthetic score transformation function converts numerical aesthetic scores into descriptive labels based on predefined ranges: scores below 4 are labeled "terrible," progressing through "very poor," "poor," "fair," "good," and "very good," with scores of 6.5 or higher labeled as "excellent." Similarly, the motion score transformation function maps motion intensity scores to descriptors: scores below 0.5 are labeled "very low," progressing through "low," "fair," "high," and "very high," with scores of 20 or more labeled as "extremely high." We find this method can make the model aware of the scores and follow them to generate videos with better quality.
For example, for a video with an aesthetic score of 5.5, a motion score of 10, and a detected camera motion of pan left, the caption will be:
```plaintext
[Original Caption] The aesthetic score is good, the motion strength is high, camera motion: pan left.
```
During inference, we can also use the scores to condition the model. For camera motion, we only label 13k clips with high confidence, and the camera motion detection module is released in our tools.
## Evaluation
Previously, we monitored the training process only by human evaluation, as the DDPM training loss is not well correlated with the quality of generated videos. However, for rectified flow, we find the training loss is well correlated with the quality of generated videos, as stated in SD3. Thus, we keep track of the rectified flow evaluation loss on 100 images and 1k videos.
We sampled 1k videos from pixabay as validation dataset. We calculate the evaluation loss for image and different lengths of videos (49 frames, 65 frames, 81 frames, 97 frames, 113 frames) for different resolution (360p, 720p). For each setting, we equidistantly sample 10 timesteps. Then all the losses are averaged.
In addition, we also keep track of [VBench](https://vchitect.github.io/VBench-project/) scores during training. VBench is an automatic video evaluation benchmark for short video generation. We calculate the VBench score with 360p 49-frame videos. The two metrics verify that our model continues to improve during training.
All the evaluation code is released in `eval` folder. Check the [README](/eval/README.md) for more details.

201
docs/train.md Normal file
View File

@ -0,0 +1,201 @@
# Step by step to train or finetune your own model
## Installation
Besides from the installation in the main page, you need to install the following packages:
```bash
pip install git+https://github.com/hpcaitech/TensorNVMe.git # requires cmake, for checkpoint saving
pip install pandarallel # for parallel processing
```
## Prepare dataset
The dataset should be presented in a `csv` or `parquet` file. To better illustrate the process, we will use a 45k [pexels dataset](https://huggingface.co/datasets/hpcai-tech/open-sora-pexels-45k) as an example. This dataset contains clipped, score filtered high-quality videos from [Pexels](https://www.pexels.com/).
First, download the dataset to your local machine:
```bash
mkdir datasets
cd datasets
# For Chinese users, export HF_ENDPOINT=https://hf-mirror.com to speed up the download
huggingface-cli download --repo-type dataset hpcai-tech/open-sora-pexels-45k --local-dir open-sora-pexels-45k # 250GB
cd open-sora-pexels-45k
cat tar/pexels_45k.tar.* > pexels_45k.tar
tar -xvf pexels_45k.tar
mv pexels_45k .. # make sure the path is Open-Sora/datasets/pexels_45k
```
There are three `csv` files provided:
- `pexels_45k.csv`: contains only path and text, which needs to be processed for training.
- `pexels_45k_necessary.csv`: contains necessary information for training.
- `pexels_45k_score.csv`: contains score information for each video. The 45k videos are filtered out based on the score. See tech report for more details.
If you want to use custom dataset, at least the following columns are required:
```csv
path,text,num_frames,height,width,aspect_ratio,resolution,fps
```
We provide a script to process the `pexels_45k.csv` to `pexels_45k_necessary.csv`:
```bash
# single process
python scripts/cnv/meta.py --input datasets/pexels_45k.csv --output datasets/pexels_45k_necessary.csv --num_workers 0
# parallel process
python scripts/cnv/meta.py --input datasets/pexels_45k.csv --output datasets/pexels_45k_necessary.csv --num_workers 64
```
> The process may take a while, depending on the number of videos in the dataset. The process is necessary for training on arbitrary aspect ratios, resolutions, and numbers of frames.
## Training
The command format to launch training is as follows:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py [path/to/config] --dataset.data-path [path/to/dataset] [override options]
```
For example, to train a model with stage 1 config from scratch using pexels dataset:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv
```
### Config
All configs are located in `configs/diffusion/train/`. The following rules are applied:
- `_base_ = ["config_to_inherit"]`: inherit from another config by mmengine's support. Variables are overwritten by the new config. Dictionary is merged if `_delete_` key is not present.
- command line arguments override the config file. For example, `--lr 1e-5` will override the `lr` in the config file. `--dataset.data-path datasets/pexels_45k_necessary.csv` will override the `data-path` value in the dictionary `dataset`.
The `bucket_config` is used to control different training stages. It is a dictionary of dictionaries. The tuple means (sampling probability, batch size). For example:
```python
bucket_config = {
"256px": {
1: (1.0, 45), # for 256px images, use 100% of the data with batch size 45
33: (1.0, 12), # for 256px videos with no less than 33 frames, use 100% of the data with batch size 12
65: (1.0, 6), # for 256px videos with no less than 65 frames, use 100% of the data with batch size 6
97: (1.0, 4), # for 256px videos with no less than 97 frames, use 100% of the data with batch size 4
129: (1.0, 3), # for 256px videos with no less than 129 frames, use 100% of the data with batch size 3
},
"768px": {
1: (0.5, 13), # for 768px images, use 50% of the data with batch size 13
},
"1024px": {
1: (0.5, 7), # for 1024px images, use 50% of the data with batch size 7
},
}
```
We provide the following configs, the batch size is searched on H200 GPUs with 140GB memory:
- `image.py`: train on images only.
- `stage1.py`: train on videos with 256px resolution.
- `stage2.py`: train on videos with 768px resolution with sequence parallelism (default 4).
- `stage1_i2v.py`: train t2v and i2v with 256px resolution.
- `stage2_i2v.py`: train t2v and i2v with 768px resolution.
We also provide a demo config `demo.py` with small batch size for debugging.
### Fine-tuning
To finetune from Open-Sora v2, run:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/Open_Sora_v2.safetensors
```
To finetune from flux-dev, we provided a transformed flux-dev [ckpts](https://huggingface.co/hpcai-tech/flux1-dev-fused-rope). Download it to `ckpts` and run:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/flux1-dev-fused-rope.safetensors
```
### Multi-GPU
To train on multiple GPUs, use `colossalai run`:
```bash
colossalai run --hostfile hostfiles --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --model.from_pretrained ckpts/Open_Sora_v2.safetensors
```
`hostfiles` is a file that contains the IP addresses of the nodes. For example:
```bash
xxx.xxx.xxx.xxx
yyy.yyy.yyy.yyy
zzz.zzz.zzz.zzz
```
use `--wandb True` to log the training process to [wandb](https://wandb.ai/).
### Resume training
To resume training, use `--load`. It will load the optimizer state and dataloader state.
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --load outputs/your_experiment/epoch*-global_step*
```
If you want to load the optimizer state but not the dataloader state, use:
```bash
torchrun --nproc_per_node 8 scripts/diffusion/train.py configs/diffusion/train/stage1.py --dataset.data-path datasets/pexels_45k_necessary.csv --load outputs/your_experiment/epoch*-global_step* --start-step 0 --start-epoch 0
```
> Note if dataset, batch size, and number of GPUs are changed, the dataloader state will not be meaningful.
## Inference
The inference is the same as described in the main page. The command format is as follows:
```bash
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --model.from_pretrained outputs/your_experiment/epoch*-global_step*
```
## Advanced Usage
More details are provided in the tech report. If an explanation of some techniques is needed, feel free to open an issue.
- Tensor parallelism and sequence parallelism
- Zero 2
- Pin memory organization
- Garbage collection organization
- Data prefetching
- Communication bucket optimization
- Shardformer for T5
### Gradient Checkpointing
We support selective gradient checkpointing to save memory. The `grad_ckpt_setting` is a tuple, the first element is the number of dual layers to apply gradient checkpointing, the second element is the number of single layers to apply full gradient. A very large number will apply full gradient to all layers.
```python
grad_ckpt_setting = (100, 100)
model = dict(
grad_ckpt_setting=grad_ckpt_setting,
)
```
To further save memory, you can offload gradient checkpointing to CPU by:
```python
grad_ckpt_buffer_size = 25 * 1024**3 # 25GB
```
### Asynchronous Checkpoint Saving
With `--async-io True`, the checkpoint will be saved asynchronously with the support of ColossalAI. This will save time for checkpoint saving.
### Dataset
With a very large dataset, the `csv` file or even `parquet` file may be too large to fit in memory. We provide a script to split the dataset into smaller chunks:
```bash
python scripts/cnv/shard.py /path/to/dataset.parquet
```
Then a folder with shards will be created. You can use the `--dataset.memory_efficient True` to load the dataset shard by shard.

49
docs/zh_CN/report_v1.md Normal file
View File

@ -0,0 +1,49 @@
# Open-Sora v1 技术报告
OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。
## 选择高效的架构
为了降低计算成本我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此我们决定在我们第一个版本中使用2D VAE来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。
视频训练涉及大量的token。考虑到24fps的1分钟视频我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此我们使用时空注意力来降低成本这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。
如图中所示在STDiTST代表时空我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好但我们在16x256x256视频上的实验表明相同数量的迭代次数下性能排名为DiT完整> STDiT顺序> STDiT并行≈ Latte。因此我们出于效率考虑选择了STDiT顺序。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。
![Architecture Comparison](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch_comp.png)
为了专注于视频生成我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。
![Architecture](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_arch.jpg)
借鉴PixArt-α和Stable Video Diffusion的成功我们还采用了渐进式训练策略在366K预训练数据集上进行16x256x256的训练然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入这一策略极大地降低了计算成本。
我们还尝试在DiT中使用3D patch嵌入器。然而在时间维度上2倍下采样后生成的视频质量较低。因此我们将在下一版本中将下采样留给时间VAE。目前我们在每3帧采样一次进行16帧训练以及在每2帧采样一次进行64帧训练。
## 数据是训练高质量模型的核心
我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割366K个视频片段。这些视频的质量参差不齐而且字幕也不够准确。因此我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA一个图像字幕模型通过三个帧和一个设计好的提示来标记视频。有了设计好的提示LLaVA能够生成高质量的字幕。
![Caption](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_caption.png)
由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。
## 训练细节
在有限的训练预算下我们只进行了一些探索。我们发现学习率1e-4过大因此将其降低到2e-5。在进行大批量训练时我们发现`fp16`比`bf16`不太稳定可能会导致生成失败。因此我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。
## 损失曲线
16x256x256 预训练损失曲线
![16x256x256 Pretraining Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_1.png)
16x256x256 高质量训练损失曲线
![16x256x256 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_2.png)
16x512x512 高质量训练损失曲线
![16x512x512 HQ Training Loss Curve](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_loss_curve_3.png)

114
docs/zh_CN/report_v2.md Normal file
View File

@ -0,0 +1,114 @@
# Open-Sora 1.1 技术报告
- [模型架构修改](#模型架构修改)
- [支持不同视频长度/分辨率/宽高比/帧率fps训练](#支持不同视频长度分辨率宽高比帧率fps训练)
- [使用Masked DiT作为图生视频/视频生视频模型](#使用masked-dit作为图生视频视频生视频模型)
- [数据收集和流程](#数据收集和流程)
- [训练详情](#训练详情)
- [结果和评价](#结果和评价)
- [不足和下一步计划](#不足和下一步计划)
在Open-Sora1.1版本中我们使用了10M数据来训练经过结构调优后的STDiT的700M模型Open-Sora1.0版本仅用400K数据。我们实现了[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的以下功能:
- 可变的视频时长、分辨率、宽高比(包括采样灵活性、改进的取景范围和构图)
- 提示词增加图片和视频选项(使图像动起来、生成式增长视频、视频到视频编辑、连接不同视频)
- 图像生成功能
为了实现这一目标我们在预训练阶段使用了多任务学习。对于扩散模型来说用不同的采样时间步长进行训练已经是一种多任务学习。我们将这一思想在图像和视频的条件生成模型上进一步扩展到多分辨率、宽高比、帧长、fps以及不同的掩码策略。我们在**0~15s、144p到720p、各种宽高比的视频**上训练模型。虽然由于训练FLOPs不足的限制生成的视频在时间一致性上的表现没有那么高但我们仍然可以看到这个模型的巨大潜力。
## 模型架构修改
我们对原始ST-DiT模型进行了以下修改以获得更好的训练稳定性和模型性能ST-DiT-2
- **在时间注意力模块中添加[旋转位置编码](https://arxiv.org/abs/2104.09864)**遵循目前LLM的最佳实践我们将时间注意力模块中的正弦位置编码更改为旋转位置编码因为它也算一项序列预测任务。
- **在时间注意力模块中添加AdaIN和Layernormal**我们将时间注意力与AdaIN和Layer范数作为空间注意力包裹起来以稳定训练。
- **[QK归一化](https://arxiv.org/abs/2302.05442)与[RMSNorm](https://arxiv.org/abs/1910.07467)**:和[SD3](https://arxiv.org/pdf/2403.03206.pdf)类似地我们应用QK归一化来提高半精度训练的稳定性。
- **支持动态输入大小和视频条件限定**为了支持多分辨率、宽高比和fps训练我们修改了ST-DiT-2来接受任何输入大小。延伸[PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha)的想法我们支持限定视频的高度、宽度、宽高比、帧长和fps。
- **将T5token数量从120扩展到200**我们使用的视频描述通常少于200个token我们发现模型也可以很好地处理更长的文本。
## 支持不同视频长度/分辨率/宽高比/帧率fps训练
正如[Sora报告](https://openai.com/research/video-generation-models-as-world-simulators)中提到的,使用原始无损视频的分辨率、宽高比和视频长度进行训练可以增加采样灵活性,改善取景和构图。我们找到了三种实现这一目标的方法:
- [NaViT](https://arxiv.org/abs/2307.06304)通过不同掩码策略支持在同一训练批次内使用不同大小的数据并且训练效率下降很少。然而该系统实现起来有点复杂并且可能无法兼容kernel优化技术如FlashAttention
- 填充([FiT](https://arxiv.org/abs/2402.12376)[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)):通过填充支持同一批次内的不同大小的数据。然而,将不同的分辨率填充到相同的大小会导致效率降低。
- 分桶训练([SDXL](https://arxiv.org/abs/2307.01952)、[PixArt](https://arxiv.org/abs/2310.00426)):支持通过分桶的方式在不同批次中动态调整大小,但在同一批次内数据大小必须相同,只能应用固定数量的数据大小。在一个批次中,我们不需要实现复杂的掩码或填充。
为了更便捷的实现,我们选择分桶训练的方式。我们预先定义了一些固定的分辨率,并将不同的样本分配到不同的桶中。下面列出了分桶方案中值得注意的点。但我们可以看到,这些在我们的实验中并不是一个大问题。
<details>
<summary>查看注意事项</summary>
- 桶大小被限制为固定数量首先在实际应用中通常只使用少数宽高比9:16、3:4和分辨率240p、1080p。其次我们发现经过训练的模型可以很好地推广到未见过的解决方案。
- 每批的大小相同打破了独立同分布i.i.d.)假设:由于我们使用多个 GPU因此不同 GPU 上的本地批次具有不同的大小。我们没有发现此问题导致性能显着下降。
- 可能没有足够的样本来填充每个桶,并且分布可能有偏差:首先,当本地批量大小不太大时,我们的数据集足够大以填充每个桶。其次,我们应该分析数据大小的分布并相应地定义桶大小。第三,分配不平衡并没有显着影响训练过程。
- 不同的分辨率和帧长可能有不同的处理速度与PixArt只处理相似分辨率相似token数的宽高比不同我们需要考虑不同分辨率和帧长的处理速度。我们可以使用“bucket_config”来定义每个桶的批量大小以确保处理速度相似。
</details>
![bucket](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_bucket.png)
如图所示桶是分辨率帧数量宽高比的三元组。我们为不同的分辨率提供预定义的宽高比涵盖了大多数常见的视频宽高比。在每个epoch之前我们打乱数据集并将样本分配到不同的桶中如图所示。我们将样本放入最大分辨率和帧长度小于视频的桶中。
考虑到我们的计算资源有限我们进一步为每个分辨率num_frame二元组引入keep_prob和batch_size两个属性以降低计算成本并实现多阶段训练。具体来说高清视频将以概率1-keep_prob下采样到较低分辨率的桶中并且每个桶的样本数量是由batch_size属性决定的。这样我们可以控制不同桶中的样本数量并通过为每个桶搜索合适的数据量来平衡GPU负载。
有关训练中桶使用的详细说明,请参阅[配置文件](/docs/config.md#training-bucket-configs).
## 使用Masked DiT作为图生视频/视频生视频模型
Transformer可以很容易地扩展到支持图生图和视频生视频的任务。我们提出了一种蒙版策略来支持图像和视频的调节。蒙版策略如下图所示。
![mask strategy](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask.png)
在将图像或视频转换成另一个视频的过程中我们通常会选择出需要作为条件的帧并取消其掩码unmask。在使用ST-DiT模型进行前向传播时被选择取消掩码unmask的帧将被赋予时间步长0而其他帧则保持它们原有的时间步长t。我们发现如果直接将这种策略应用到训练好的模型上会得到较差的结果因为扩散模型在训练过程中并未学会如何处理一个样本中具有不同时间步长的帧。
受[UL2](https://arxiv.org/abs/2205.05131)的启发我们在训练期间引入了随机掩码策略。具体来说我们在训练期间随机取消掩码帧包括取消掩码第一帧前k帧最后k帧最后k帧第一和最后k帧随机帧等。基于Open-Sora 1.0模型以50%的概率应用掩码策略我们发现模型能够在10,000步的训练中学会处理图像条件而30%的概率会导致处理能力变差同时文本到视频的性能略有下降。因此在Open-Sora 1.1版本中,我们从头开始预训练模型,并采用了掩码策略。
下图给出了用于推理的掩码策略配置的说明。五数字元组在定义掩码策略方面提供了极大的灵活性。
![mask strategy config](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_mask_config.png)
掩码策略用法的详细说明可在[配置文件](/docs/config.md#advanced-inference-config)中查看.
## 数据收集和流程
正如我们在Sora1.0版本中看见的那样,数据数量和质量对于训练一个好的模型至关重要,因此,我们努力扩展数据集。首先,我们创建了一个遵循[SVD](https://arxiv.org/abs/2311.15127)的自动流水线,包括场景切割、字幕、各种评分和过滤以及数据集管理脚本和通用惯例。
![pipeline](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_data_pipeline.png)
我们计划使用[panda-70M](https://snap-research.github.io/Panda-70M/)和其他数据来训练模型大约包含3000万条数据。然而我们发现磁盘输入输出disk IO在同时进行训练和数据处理时成为了一个瓶颈。因此我们只能准备一个包含1000万条数据的数据集并且没有完成我们构建的所有处理流程。最终我们使用了包含970万视频和260万图像的数据集进行预训练以及560,000视频和160万图像的数据集进行微调。预训练数据集的统计信息如下所示。
图像文本标记 (使用T5分词器)
![image text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_image_textlen.png)
视频文本标记 (使用T5分词器)。我们直接使用Panda的短视频描述进行训练并自己给其他数据集加视频描述。生成的字幕通常少于200个token。
![video text tokens](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_textlen.png)
视频时长:
![video duration](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_video_duration.png)
## 训练详情
由于计算资源有限我们必须仔细监控训练过程并在推测模型学习不佳时更改训练策略因为没有消融研究的计算。因此Open-Sora1.1版本的训练包括多个更改所以指数移动平均EMA未被应用。
1. 首先,我们从`Pixart-alpha-1024`的模型checkpoint开始使用不同分辨率的图像进行了6000步的微调。我们发现模型能够很容易地适应并生成不同分辨率的图像。为了加快扩散过程的训练我们使用了[SpeeDiT](https://github.com/1zeryu/SpeeDiT)iddpm-speed技术。
2. **[阶段一]** 然后我们使用梯度检查点gradient-checkpointing技术对模型进行了**24,000**步的预训练这个过程在64个H800 GPU上运行了**4天**。尽管模型看到的数据样本数量相同,我们发现与使用较小批量大小相比,模型的学习速度较慢。我们推测,在训练的早期阶段,步数的数量对于训练更为重要。大多数视频的分辨率是**240p**,预训练时使用的配置与[stage2.py](/configs/opensora-v1-1/train/stage2.py)相似。
3. **[阶段一]** 为了增加训练步数我们改用了更小的批量大小并且没有使用梯度检查点技术。在这个阶段我们还引入了帧率fps条件。模型训练了**40,000**步,持续了**2天**。训练中使用的视频大多数是**144p**分辨率,使用的配置文件是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。我们使用较低的分辨率因为我们在Open-Sora 1.0版本中发现模型可以以相对较低的分辨率学习时间知识。
4. **[阶段一]** 我们发现模型不能很好地学习长视频并在Open-Sora1.0训练中发现了一个噪声生成结果推测是半精度问题。因此我们采用QK-归一化来稳定训练。我们还将iddpm-speed切换成iddpm。我们训练了**17k**步**14小时**。大多数视频的分辨率是144p预训练时使用的配置是[stage1.py](/configs/opensora-v1-1/train/stage1.py)。阶段1训练持续约一周总步长**81k**。
5. **[阶段二]** 我们切换到更高的分辨率,其中大多数视频是**240p和480p**分辨率([stage2.py](/configs/opensora-v1-1/train/stage2.py))。我们在所有预训练数据上训练了**22000**步,持续**一天**。
6. **[阶段三]** 我们切换到更高的分辨率,大多数视频的分辨率是**480p和720p**[stage3.py](/configs/opensora-v1-1/train/stage3.py))。我们在高质量数据上训了**4000**步,用时**一天**。
## 结果和评价
## 不足和下一步计划
随着我们离Sora的复现又近了一步我们发现当前模型存在许多不足这些不足将在我们下阶段工作中得到改善。
- **噪音的生成和影响**我们发现生成的模型特别是长视频中有时很多噪点不流畅。我们认为问题在于没有使用时间VAE。由于[Pixart-Sigma](https://arxiv.org/abs/2403.04692)发现适应新VAE很容易我们计划在下一个版本中为模型开发时间VAE。
- **缺乏时间一致性**我们发现模型无法生成具有高时间一致性的视频我们认为问题是由于缺乏训练FLOPs我们计划收集更多数据并继续训练模型以提高时间一致性。
- **人像生成质量低**:我们发现模型无法生成高质量的人类视频,我们认为问题是由于缺乏人类数据,我们计划收集更多的人类数据,并继续训练模型以提高人类生成。
- **美学得分低**我们发现模型的美学得分不高。问题在于缺少美学得分过滤由于IO瓶颈我们没有进行这一步骤。我们计划通过美学得分过滤数据和微调模型来提高美学得分。
- **长视频生成质量低**:我们发现,使用同样的提示词,视频越长,质量越差。这意味着图像质量不能同等地被不同长度的序列所适应。
> - **算法与加速实现**Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
> - **数据收集与处理**Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu

159
docs/zh_CN/report_v3.md Normal file
View File

@ -0,0 +1,159 @@
# Open-Sora 1.2 报告
- [视频压缩网络](#视频压缩网络)
- [整流流和模型适应](#整流流和模型适应)
- [更多数据和更好的多阶段训练](#更多数据和更好的多阶段训练)
- [简单有效的模型调节](#简单有效的模型调节)
- [评估](#评估)
在 Open-Sora 1.2 版本中,我们在 >30M 数据上训练了 一个1.1B 的模型,支持 0s~16s、144p 到 720p、各种宽高比的视频生成。我们的配置如下所列。继 1.1 版本之后Open-Sora 1.2 还可以进行图像到视频的生成和视频扩展。
| | 图像 | 2秒 | 4秒 | 8秒 | 16秒 |
| ---- | ----- | --- | --- | --- | --- |
| 240p | ✅ | ✅ | ✅ | ✅ | ✅ |
| 360p | ✅ | ✅ | ✅ | ✅ | ✅ |
| 480p | ✅ | ✅ | ✅ | ✅ | 🆗 |
| 720p | ✅ | ✅ | ✅ | 🆗 | 🆗 |
这里✅表示在训练期间可以看到数据🆗表示虽然没有经过训练但模型可以在该配置下进行推理。🆗的推理需要多个80G内存的GPU和序列并行。
除了 Open-Sora 1.1 中引入的功能外Open-Sora 1.2 还有以下重磅更新:
- 视频压缩网络
- 整流流训练
- 更多数据和更好的多阶段训练
- 简单有效的模型调节
- 更好的评估指标
上述改进的所有实现(包括训练和推理)均可在 Open-Sora 1.2 版本中使用。以下部分将介绍改进的细节。我们还改进了代码库和文档,使其更易于使用。
## 视频压缩网络
对于 Open-Sora 1.0 & 1.1,我们使用了 stable-ai 的 83M 2D VAE它仅在空间维度上压缩将视频压缩 8x8 倍。为了减少时间维度我们每三帧提取一帧。然而这种方法导致生成的视频流畅度较低因为牺牲了生成的帧率fps。因此在这个版本中我们引入了像 OpenAI 的 Sora 一样的视频压缩网络。该网络在时域上将视频大小压缩至四分之一,因此,我们不必再额外抽帧,而可以使用原有帧率生成模型。
考虑到训练 3D VAE 的计算成本很高,我们希望重新利用在 2D VAE 中学到的知识。我们注意到,经过 2D VAE 压缩后,时间维度上相邻的特征仍然高度相关。因此,我们提出了一个简单的视频压缩网络,首先将视频在空间维度上压缩 8x8 倍,然后将视频在时间维度上压缩 4 倍。网络如下所示:
![video_compression_network](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_3d_vae.png)
我们用[SDXL 的 VAE](https://huggingface.co/stabilityai/sdxl-vae)初始化 2D VAE ,它比我们以前使用的更好。对于 3D VAE我们采用[Magvit-v2](https://magvit.cs.cmu.edu/v2/)中的 VAE 结构,它包含 300M 个参数。加上 83M 的 2D VAE视频压缩网络的总参数为 384M。我们设定batch size 为 1 对 3D VAE 进行了 1.2M 步的训练。训练数据是来自 Pexels 和 Pixabay 的视频,训练视频大小主要是 17 帧256x256 分辨率。3D VAE 中使用causal convolutions使图像重建更加准确。
我们的训练包括三个阶段:
1. 对于前 380k 步,我们冻结 2D VAE并在 8 个 GPU 上进行训练。训练目标包括重建 2D VAE 的压缩特征(图中粉红色),并添加损失以使 3D VAE 的特征与 2D VAE 的特征相似粉红色和绿色称为identity loss。我们发现后者的损失可以快速使整个 VAE 在图像上取得良好的性能,并在下一阶段更快地收敛。
2. 对于接下来的 260k 步我们消除identity loss并仅学习 3D VAE。
3. 对于最后 540k 步,由于我们发现仅重建 2D VAE 的特征无法带来进一步的改进因此我们移除了loss并训练整个 VAE 来重建原始视频。此阶段在 24 个 GPU 上进行训练。
对于训练的前半部分,我们采用 20% 的图像和 80% 的视频。按照[Magvit-v2](https://magvit.cs.cmu.edu/v2/),我们使用 17 帧训练视频,同时对图像的前 16 帧进行零填充。然而,我们发现这种设置会导致长度不同于 17 帧的视频变得模糊。因此,在第 3 阶段我们使用不超过34帧长度的任意帧长度视频进行混合视频长度训练,以使我们的 VAE 对不同视频长度更具鲁棒性也就是说如果我们希望训练含有n帧的视频我们就把原视频中`34-n`帧用0进行填充。我们的 [训练](/scripts/train_vae.py)和[推理](/scripts/inference_vae.py)代码可在 Open-Sora 1.2 版本中找到。
当使用 VAE 进行扩散模型时,我们的堆叠 VAE 所需的内存较少,因为我们的 VAE 的输入已经经过压缩。我们还将输入视频拆分为几个 17 帧剪辑,以提高推理效率。我们的 VAE 与[Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md)中的另一个开源 3D VAE 性能相当。
| 模型 | 结构相似性↑ | 峰值信噪比↑ |
| ------------------ | ----- | ------ |
| Open-Sora-Plan 1.1 | 0.882 | 29.890 |
| Open-Sora 1.2 | 0.880 | 30.590 |
## 整流流和模型适应
最新的扩散模型 Stable Diffusion 3 为了获得更好的性能,采用了[rectified flow](https://github.com/gnobitab/RectifiedFlow)替代了 DDPM。可惜 SD3 的 rectified flow 训练代码没有开源。不过 Open-Sora 1.2 提供了遵循 SD3 论文的训练代码,包括:
- 基本整流流训练
- 用于训练加速的 Logit-norm 采样
- 分辨率和视频长度感知时间步长采样
对于分辨率感知的时间步长采样,我们应该对分辨率较大的图像使用更多的噪声。我们将这个想法扩展到视频生成,对长度较长的视频使用更多的噪声。
Open-Sora 1.2 从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma) 模型checkpoint开始。请注意此模型使用 DDPM 和 SDXL VAE 进行训练,分辨率也高得多。我们发现在小数据集上进行微调可以轻松地使模型适应我们的视频生成设置。适应过程如下,所有训练都在 8 个 GPU 上完成:
1. 多分辨率图像生成能力:我们训练模型以 20k 步生成从 144p 到 2K 的不同分辨率。
2. QK-norm我们将 QK-norm 添加到模型中并训练 18k 步。
3. 整流流:我们从离散时间 DDPM 转变为连续时间整流流并训练 10k 步。
4. 使用 logit-norm 采样和分辨率感知时间步采样的整流流:我们训练 33k 步。
5. 较小的 AdamW epsilon按照 SD3使用 QK-norm我们可以对 AdamW 使用较小的 epsilon1e-15我们训练 8k 步。
6. 新的 VAE 和 fps 调节:我们用自己的 VAE 替换原来的 VAE并将 fps 调节添加到时间步调节中,我们训练 25k 步。请注意,对每个通道进行规范化对于整流流训练非常重要。
7. 时间注意力模块:我们添加时间注意力模块,其中没有初始化投影层。我们在图像上进行 3k 步训练。
8. 仅针对具有掩码策略的视频的时间块:我们仅在视频上训练时间注意力块,步长为 38k。
经过上述调整后,我们就可以开始在视频上训练模型了。上述调整保留了原始模型生成高质量图像的能力,并为后续的视频生成提供了许多助力:
- 通过整流我们可以加速训练将视频的采样步数从100步减少到30步大大减少了推理的等待时间。
- 使用 qk-norm训练更加稳定并且可以使用积极的优化器。
- 采用新的VAE时间维度压缩了4倍使得训练更加高效。
- 该模型具有多分辨率图像生成能力,可以生成不同分辨率的视频。
## 更多数据和更好的多阶段训练
由于计算预算有限,我们精心安排了训练数据的质量从低到高,并将训练分为三个阶段。我们的训练涉及 12x8 GPU总训练时间约为 2 周, 约70k步。
### 第一阶段
我们首先在 Webvid-10M 数据集40k 小时)上训练模型,共 30k 步2 个 epoch。由于视频分辨率均低于 360p 且包含水印,因此我们首先在此数据集上进行训练。训练主要在 240p 和 360p 上进行,视频长度为 2s~16s。我们使用数据集中的原始字幕进行训练。训练配置位于[stage1.py](/configs/opensora-v1-2/train/stage1.py)中。
### 第二阶段
然后我们在 Panda-70M 数据集上训练模型。这个数据集很大,但质量参差不齐。我们使用官方的 30M 子集,其中的片段更加多样化,并过滤掉美学评分低于 4.5 的视频。这产生了一个 20M 子集,包含 41k 小时。数据集中的字幕直接用于我们的训练。训练配置位于[stage2.py](/configs/opensora-v1-2/train/stage2.py)中。
训练主要在 360p 和 480p 上进行。我们训练模型 23k 步,即 0.5 个 epoch。训练尚未完成因为我们希望我们的新模型能早日与大家见面。
### 第三阶段
在此阶段,我们从各种来源收集了 200 万个视频片段,总时长 5000 小时,其中包括:
- 来自 Pexels、Pixabay、Mixkit 等的免费授权视频。
- [MiraData](https://github.com/mira-space/MiraData):一个包含长视频的高质量数据集,主要来自游戏和城市/风景探索。
- [Vript](https://github.com/mutonix/Vript/tree/main):一个密集注释的数据集。
- 还有一些其他数据集。
MiraData 和 Vript 有来自 GPT 的字幕,而我们使用[PLLaVA](https://github.com/magic-research/PLLaVA)为其余字幕添加字幕。与只能进行单帧/图像字幕的 LLaVA 相比PLLaVA 是专门为视频字幕设计和训练的。[加速版PLLaVA](/tools/caption/README.md#pllava-captioning)已在我们的`tools/`中发布。在实践中,我们使用预训练的 PLLaVA 13B 模型,并从每个视频中选择 4 帧生成字幕,空间池化形状为 2*2。
下面显示了此阶段使用的视频数据的一些统计数据。我们提供了持续时间和分辨率的基本统计数据,以及美学分数和光流分数分布。我们还从视频字幕中提取了对象和动作的标签并计算了它们的频率。
![stats](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_video_stats.png)
![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_objects_count.png)
![object_count](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report-03_actions_count.png)
此阶段我们主要在 720p 和 1080p 上进行训练以提高模型在高清视频上的表现力。在训练中我们使用的掩码率为25%。训练配置位于[stage3.py](/configs/opensora-v1-2/train/stage3.py)中。我们对模型进行 15k 步训练,大约为 2 个 epoch。
## 简单有效的模型调节
对于第 3 阶段,我们计算每个视频片段的美学分数和运动分数。但是,由于视频片段数量较少,我们不愿意过滤掉得分较低的片段,这会导致数据集较小。相反,我们将分数附加到字幕中并将其用作条件。我们发现这种方法可以让模型了解分数并遵循分数来生成质量更好的视频。
例如,一段美学评分为 5.5、运动评分为 10 且检测到摄像头运动向左平移的视频,其字幕将为:
```plaintext
[Original Caption] aesthetic score: 5.5, motion score: 10, camera motion: pan left.
```
在推理过程中,我们还可以使用分数来调节模型。对于摄像机运动,我们仅标记了 13k 个具有高置信度的剪辑,并且摄像机运动检测模块已在我们的工具中发布。
## 评估
之前,我们仅通过人工评估来监控训练过程,因为 DDPM 训练损失与生成的视频质量没有很好的相关性。但是,对于校正流,如 SD3 中所述,我们发现训练损失与生成的视频质量有很好的相关性。因此,我们跟踪了 100 张图像和 1k 个视频的校正流评估损失。
我们从 pixabay 中抽样了 1k 个视频作为验证数据集。我们计算了不同分辨率144p、240p、360p、480p、720p下图像和不同长度的视频2s、4s、8s、16s的评估损失。对于每个设置我们等距采样 10 个时间步长。然后对所有损失取平均值。
![Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_val_loss.png)
![Video Evaluation Loss](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vid_val_loss.png)
此外,我们还会在训练过程中跟踪[VBench](https://vchitect.github.io/VBench-project/)得分。VBench 是用于短视频生成的自动视频评估基准。我们用 240p 2s 视频计算 vbench 得分。这两个指标验证了我们的模型在训练过程中持续改进。
![VBench](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/report_vbench_score.png)
所有评估代码均发布在`eval`文件夹中。查看[评估指南](/eval/README.md)了解更多详细信息。
|模型 | 总得分 | 质量得分 | 语义分数 |
| -------------- | ----------- | ------------- | -------------- |
| Open-Sora V1.0 | 75.91% | 78.81% | 64.28% |
| Open-Sora V1.2 | 79.23% | 80.71% | 73.30% |
## 序列并行
我们使用序列并行来支持长序列训练和推理。我们的实现基于Ulysses工作流程如下所示。启用序列并行后我们只需要将 `all-to-all` 通信应用于STDiT中的空间模块spatial block因为在序列维度上只有对空间信息的计算是相互依赖的。
![SP](https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/sequence_parallelism.jpeg)
目前,由于训练数据分辨率较小,我们尚未使用序列并行进行训练,我们计划在下一个版本中使用。至于推理,我们可以使用序列并行,以防您的 GPU 内存不足。下表显示,序列并行可以实现加速:
| 分辨率 | 时长 | GPU数量 | 是否启用序列并行 |用时(秒) | 加速效果/GPU |
| ---------- | ------- | -------------- | --------- | ------------ | --------------- |
| 720p | 16秒 | 1 | 否 | 547.97 | - |
| 720p | 16秒 | 2 | 是 | 244.38 | 12% |

117
docs/zh_CN/report_v4.md Normal file
View File

@ -0,0 +1,117 @@
# Open-Sora 1.3 报告
- [视频压缩网络](#视频压缩网络)
- [升级版带位移窗口注意力的STDiT](#升级版带位移窗口注意力的STDiT)
- [简单有效的模型条件控制](#简单有效的模型条件控制)
- [评估方法](#评估方法)
在Open-Sora 1.3版本中我们在超过60M约85k小时的数据上训练了一个1.1B参数的模型训练耗时35k H100 GPU小时支持0~113帧、360p和720p分辨率以及多种宽高比的视频生成。我们的配置如下。延续1.2版本的特性Open-Sora 1.3同样支持图像到视频的生成和视频延展。
| | image | 49 frames | 65 frames | 81 frames | 97 frames | 113 frames |
| ---- | ----- | ---------- | ---------- | ---------- | --------- | ---------- |
| 360p | ✅ | ✅ | ✅ | ✅ | ✅ |✅ |
| 720p | ✅ | ✅ | ✅ | ✅ | ✅ |✅ |
这里✅表示在训练过程中已经见过的数据。
除了Open-Sora 1.2中引入的特性外Open-Sora 1.3的亮点包括:
- 视频压缩网络
- 升级版带位移窗口注意力的STDiT
- 更多数据和更好的多阶段训练
- 简单有效的模型条件控制
- 更好的评估指标
以上所有改进的实现包括训练和推理都在Open-Sora 1.3版本中提供。以下部分将详细介绍这些改进。我们还优化了代码库和文档以使其更易于使用和开发并添加了LLM优化器来[优化输入提示词](/README.md#gpt-4o-prompt-refinement)并支持更多语言。
## 视频压缩网络
在Open-Sora 1.2中视频压缩架构采用了模块化方法分别处理空间和时间维度。基于Stability AI的SDXL VAE的空间VAE压缩单个帧的空间维度。时间VAE则处理来自空间VAE的潜在表示以实现时间压缩。这种两阶段设计实现了有效的空间和时间压缩但也带来了一些限制。这些限制包括由于固定长度输入帧而导致的长视频处理效率低下、空间和时间特征之间缺乏无缝集成以及在训练和推理过程中更高的内存需求。
Open-Sora 1.3引入了统一的视频压缩方法。通过将空间和时间处理结合到单一框架中并利用诸如分块3D卷积和动态帧支持等高级特性Open-Sora 1.3实现了更好的效率、可扩展性和重建质量。以下是Open-Sora 1.3 VAE的主要改进
**1. 统一的时空处理:** 不同于使用独立的VAE进行空间和时间压缩Open-Sora 1.3采用单一的编码器-解码器结构同时处理这两个维度。这种方法消除了中间表示和空间-时间模块之间的冗余数据传输的需求。
**2. 分块3D卷积** Open-Sora 1.3在时间维度上引入了分块3D卷积支持。通过将视频分解成更小的时间块该特性实现了对更长视频序列的高效编码和解码而不会增加内存开销。这一改进解决了Open-Sora 1.2在处理大量帧时的限制,确保了更高的时间压缩灵活性。
**3. 动态微批次和微帧处理:** Open-Sora 1.3引入了新的微批次和微帧处理机制。这实现了:(1) 自适应时间重叠:时间编码和解码过程中的重叠帧帮助减少块边界的不连续性。(2) 动态帧大小支持不再局限于固定长度序列如Open-Sora 1.2中的17帧Open-Sora 1.3支持动态序列长度,使其能够适应不同的视频长度。
**4. 统一的归一化机制:** Open-Sora 1.3中的归一化过程通过可调的缩放(scale)和平移(shift)参数得到了改进确保了不同数据集间潜在空间分布的一致性。与Open-Sora 1.2特定于固定数据集的归一化不同,这个版本引入了更通用的参数并支持特定于帧的归一化策略。
#### 改进总结
| 特性 | Open-Sora 1.2 | Open-Sora 1.3 |
|---------------|---------------------------------|----------------------------------|
| **架构** | 独立的空间和时间VAE | 统一的时空VAE |
| **分块处理** | 不支持 | 支持分块3D卷积 |
| **帧长度支持** | 固定17帧 | 支持动态帧长度和重叠 |
| **归一化** | 固定参数 | 可调的缩放和平移参数 |
## 包含滑动窗口注意力的STDiT
在Open-Sora 1.2取得成功的基础上1.3版本引入了多项架构改进和新功能,以提升视频生成的质量和灵活性。本节概述了这两个版本之间的主要改进和差异。
最新的扩散模型如Stable Diffusion 3采用[rectified flow](https://github.com/gnobitab/RectifiedFlow)代替DDPM以获得更好的性能。虽然SD3的rectified flow训练代码未开源但OpenSora按照SD3论文提供了训练代码实现。OpenSora 1.2从SD3引入了几个关键策略
1. 基础的rectified flow训练实现连续时间扩散
2. Logit-norm采样用于加速训练遵循SD3论文第3.1节),优先在中等噪声水平采样时间步
3. 分辨率和视频长度感知的时间步采样遵循SD3论文第5.3.2节),对更大分辨率和更长视频使用更多噪声
在OpenSora 1.3中,我们在架构、功能和性能方面进行了显著改进:
#### 1. 位移窗口注意力机制
- 引入可配置kernel_size的基于核的局部注意力提高计算效率
- 实现类似Swin Transformer的位移窗口分区策略
- 增加带extra_pad_on_dims支持的窗口边界填充掩码处理
- 在局部窗口时间、高度、宽度内扩展3D相对位置编码
#### 2. 增强的位置编码
- 改进RoPE实现将rotation_dim降至原来的1/3以适应3D场景
- 为时间、高度和宽度维度添加独立的旋转嵌入
- 实现分辨率自适应的位置编码缩放
- 可选的空间RoPE以更好地建模空间关系
#### 3. 灵活的生成能力
- 添加I2V和V2V功能配备专门的条件控制机制
- 引入条件嵌入模块x_embedder_cond和x_embedder_cond_mask
- 零初始化条件嵌入以实现稳定训练
- 通过skip_temporal选项实现灵活的时序建模
#### 4. 性能优化
- 改进Flash Attention触发条件N > 128以提高效率
- 添加torch.scaled_dot_product_attention (SDPA)作为替代后端
- 通过改进的填充和窗口分区优化内存使用
- 通过自适应高度填充增强序列并行性
从[PixArt-Σ 2K](https://github.com/PixArt-alpha/PixArt-sigma)的适应过程保持相似,但增加了额外步骤:
[第1-7点与v1.2相同多分辨率训练、QK-norm、rectified flow、logit-norm采样、更小的AdamW epsilon、新VAE和基础时序注意力]
#### 8. 增强的时序模块
- 添加带位移窗口支持的基于核的局部注意力
- 实现带分辨率自适应缩放的3D相对位置编码
- 采用改进的初始化策略进行投影层零初始化
相比专注于基础视频生成的v1.2v1.3在三个关键领域带来了实质性改进:**1. 质量**通过位移窗口注意力和3D位置编码增强时空建模。**2. 灵活性**支持I2V/V2V任务和可配置的时序建模。**3. 效率**:优化注意力计算和内存使用
这些改进在保持v1.2核心功能的同时扩展了模型在实际应用中的能力。模型保留了使用rectified flow生成高质量图像和视频的能力同时在条件生成和长序列建模方面获得了新的优势。
## 简单有效的模型条件控制
我们对每个视频片段计算美学分数和运动分数并过滤掉得分较低的片段从而得到一个视频质量更好的数据集。此外我们将这些分数附加到标题中并用作条件控制。具体来说我们基于预定义的范围将数值分数转换为描述性语言。美学分数转换函数基于预定义范围将数值美学分数转换为描述标签低于4分标记为"terrible",依次通过"very poor"、"poor"、"fair"、"good"和"very good"6.5分或更高标记为"excellent"。同样运动分数转换函数将运动强度分数映射为描述符低于0.5分标记为"very low",依次通过"low"、"fair"、"high"和"very high"20分或更高标记为"extremely high"。我们发现这种方法可以使模型意识到这些分数并遵循分数来生成更高质量的视频。
例如对于一个美学分数为5.5运动分数为10检测到的相机运动为向左平移的视频其标题将是
```plaintext
[Original Caption] The aesthetic score is good, the motion strength is high, camera motion: pan left.
```
在推理过程中我们也可以使用这些分数来控制模型。对于相机运动我们只标记了13k个高置信度的片段相机运动检测模块已在我们的工具中发布。
## 评估方法
此前我们仅通过人工评估来监控训练过程因为DDPM训练损失与生成视频的质量相关性不高。然而对于rectified flow我们发现正如SD3所述训练损失与生成视频的质量有很好的相关性。因此我们持续跟踪100张图像和1k个视频的rectified flow评估损失。
我们从pixabay采样了1k个视频作为验证数据集。我们计算了不同分辨率360p720p下图像和不同长度视频49帧、65帧、81帧、97帧、113帧的评估损失。对于每种设置我们等距采样10个时间步。然后对所有损失取平均值。
此外,我们还在训练期间跟踪[VBench](https://vchitect.github.io/VBench-project/)分数。VBench是一个用于短视频生成的自动视频评估基准。我们使用360p 49帧视频计算vbench分数。这两个指标验证了我们的模型在训练过程中持续改进。
所有评估代码都在`eval`文件夹中发布。查看[README](/eval/README.md)获取更多详细信息。

106
eval/I2V/launch.sh Normal file
View File

@ -0,0 +1,106 @@
#!/bin/bash
# Launch I2V (image-to-video) evaluation sampling for a trained checkpoint.
#
# Usage:
#   bash eval/I2V/launch.sh BASE_MODEL_PATH TRAINED_MODEL_PATH \
#       [I2V_HEAD_PATH] [I2V_TAIL_PATH] [I2V_LOOP_PATH] [I2V_ORI_PATH]
#
# SAVE_DIR and NUM_FRAMES may be overridden via the environment.
BASE_MODEL_PATH=$1
TRAINED_MODEL_PATH=$2
I2V_HEAD_PATH=$3
I2V_TAIL_PATH=$4
I2V_LOOP_PATH=$5
I2V_ORI_PATH=$6

# Fall back to the bundled prompt files when a path argument is omitted.
# The expansions are quoted so `[ -z ... ]` stays well-formed even for
# empty values or paths containing spaces.
if [ -z "$I2V_ORI_PATH" ]; then
    I2V_ORI_PATH="assets/texts/i2v/prompts_ori.txt"
fi
if [ -z "$I2V_HEAD_PATH" ]; then
    I2V_HEAD_PATH="assets/texts/i2v/prompts_head.txt"
fi
if [ -z "$I2V_TAIL_PATH" ]; then
    I2V_TAIL_PATH="assets/texts/i2v/prompts_tail.txt"
fi
if [ -z "$I2V_LOOP_PATH" ]; then
    I2V_LOOP_PATH="assets/texts/i2v/prompts_loop.txt"
fi

# Tag outputs with the checkpoint directory/file name (e.g. a global-step id).
STEP_RECORD=$(basename "$TRAINED_MODEL_PATH")
if [ -z "$SAVE_DIR" ]; then
    SAVE_DIR="samples/i2v/test/${STEP_RECORD}"
fi
echo "save dir: ${SAVE_DIR}"
if [ -z "$NUM_FRAMES" ]; then
    NUM_FRAMES=49
fi
echo "num frames: ${NUM_FRAMES}"
command="python scripts/inference_i2v.py configs/opensora-v1-3/inference/v2v.py"
# # original uncond
# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_uncond --use-sdedit False
# # trained uncond
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_uncond --use-sdedit False
# trained uncond
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_cond_none_image1osci --use-sdedit False --use-oscillation-guidance-for-image True --image-cfg-scale 1 --cond-type "none" --start-index 1 --end-index 2
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_ORI_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_cond_none_image1osci_bias0 --use-sdedit False --use-oscillation-guidance-for-image True --image-cfg-scale 1 --cond-type "none" --start-index 0 --end-index 2
# trained cond: i2v_head
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image1.5osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image1.5osci_text7.5osci_bias0 --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_tail
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image1.5osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 1.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_head
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image2osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_tail
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image2osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_head
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_head_image2.5osci_text7.5osci --use-sdedit False --cond-type i2v_head --use-oscillation-guidance-for-image True --image-cfg-scale 2.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_tail
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_tail_image2.5osci_text7.5osci --use-sdedit False --cond-type i2v_tail --use-oscillation-guidance-for-image True --image-cfg-scale 2.5 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# trained cond: i2v_loop
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop --use-sdedit False --cond-type i2v_loop --loop 2
# # trained cond: i2v_loop
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image1osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 1 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# # trained cond: i2v_loop
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image2osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 2 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# # trained cond: i2v_loop
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_image3osci_text7.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-image True --image-cfg-scale 3 --use-oscillation-guidance-for-text True --cfg-scale 7.5
# # trained cond: i2v_loop, cfg text osci
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text7osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 7
# # trained cond: i2v_loop, image text osci
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text3.5osci_image3.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 3.5 --use-oscillation-guidance-for-image True --image-cfg-scale 3.5
# # trained cond: i2v_loop, image text osci
# ${command} --ckpt-path ${TRAINED_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name trained_i2v_loop_text7osci_image3.5osci --use-sdedit False --cond-type i2v_loop --use-oscillation-guidance-for-text True --cfg-scale 7 --use-oscillation-guidance-for-image True --image-cfg-scale 3.5
# # base cond: i2v_head
# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_HEAD_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_head --use-sdedit False --cond-type i2v_head
# # base cond: i2v_tail
# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_TAIL_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_tail --use-sdedit False --cond-type i2v_tail
# # base cond: i2v_loop
# ${command} --ckpt-path ${BASE_MODEL_PATH} --save-dir ${SAVE_DIR} --prompt-path ${I2V_LOOP_PATH} --num-frames ${NUM_FRAMES} --resolution 360p --aspect-ratio 9:16 --sample-name base_i2v_loop --use-sdedit False --cond-type i2v_loop

114
eval/README.md Normal file
View File

@ -0,0 +1,114 @@
# Evaluation
## Human evaluation
To conduct human evaluation, we need to generate various samples. We provide many prompts in `assets/texts`, and defined some test settings covering different resolutions, durations and aspect ratios in `eval/sample.sh`. To facilitate the usage of multiple GPUs, we split sampling tasks into several parts.
```bash
# image (1)
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -1
# video (2a 2b 2c ...)
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -2a
# launch 8 jobs at once (you must read the script to understand the details)
bash eval/human_eval/launch.sh /path/to/ckpt num_frames model_name_for_log
```
## Rectified Flow Loss
Evaluate the rectified flow loss with the following commands.
```bash
# image
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/img.csv --ckpt-path /path/to/ckpt
# video
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt
# select resolution
torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-3/misc/eval_loss.py --data-path /path/to/vid.csv --ckpt-path /path/to/ckpt --resolution 720p
```
To launch multiple jobs at once, use the following script.
```bash
bash eval/loss/launch.sh /path/to/ckpt model_name
```
To obtain an organized list of scores:
```bash
python eval/loss/tabulate_rl_loss.py --log_dir path/to/log/dir
```
## VBench
[VBench](https://github.com/Vchitect/VBench) is a benchmark for short text to video generation. We provide a script for easily generating samples required by VBench.
First, generate the relevant videos with the following commands:
```bash
# vbench task, if evaluation all set start_index to 0, end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index
# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
```
After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
<!-- ```bash
bash eval/vbench/vbench.sh /path/to/video_folder /path/to/model/ckpt
``` -->
```bash
python eval/vbench/calc_vbench.py /path/to/video_folder /path/to/model/ckpt
```
Finally, we obtain the scaled scores for the model by:
```bash
python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/score/dir
```
## VBench-i2v
[VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image to video generation (beta version).
Similarly, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies".
```bash
# Step 1: generate the relevant videos
# vbench i2v tasks, if evaluation all set start_index to 0, end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5 start_index end_index
# Alternatively, launch 8 jobs at once
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name
# Step 2: run vbench to evaluate the generated samples
python eval/vbench_i2v/vbench_i2v.py /path/to/video_folder /path/to/model/ckpt
# Note that you may need to go to `VBench/vbench2_beta_i2v/utils.py` and change the hard-coded var `image_root` in the `load_i2v_dimension_info` function to your corresponding image folder.
# Step 3: obtain the scaled scores
python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/to/your/model/ckpt
# this will store the results under `eval/vbench_i2v` in the path/to/your/model/ckpt
```
Similarly as VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
```bash
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 360p 9:16 30 2 True
# if no flow control, use "None" instead
```
## VAE
Install the dependency packages following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following evaluation command:
```bash
# metric can be any one or a list of: ssim, psnr, lpips, flolpips
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir path/to/original/videos --generated_video_dir path/to/generated/videos --device cuda --sample_fps 24 --crop_size 360 --resolution 360p --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
```

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Batch-generate images with a PixArt-Sigma 2K model: one job per GPU (8 GPUs),
# each rendering NUM_PER_GPU prompts at a distinct resolution/aspect ratio.
set -x
set -e
# Prompt list (one prompt per line) and output directory for the raw images.
TEXT_PATH=/home/data/sora_data/pixart-sigma-generated/text.txt
OUTPUT_PATH=/home/data/sora_data/pixart-sigma-generated/raw
CMD="python scripts/inference.py configs/pixart/inference/1x2048MS.py"
# LOG_BASE=logs/sample/generate
# NOTE(review): $CKPT is not defined anywhere in this script — presumably it is
# exported by the caller; if unset, `dirname ""` yields "." and logs land in
# ./eval/generate. Confirm the expected invocation.
LOG_BASE=$(dirname $CKPT)/eval/generate
mkdir -p ${LOG_BASE}
# Each GPU handles a contiguous slice of NUM_PER_GPU prompts; N_LAUNCH selects
# which batch of 8*NUM_PER_GPU prompts this invocation covers (here: the third,
# starting at index 160000).
NUM_PER_GPU=10000
N_LAUNCH=2
NUM_START=$(($N_LAUNCH * $NUM_PER_GPU * 8))
# Eight background jobs, one per GPU, each at a different target image size;
# stdout/stderr are captured per job under ${LOG_BASE}.
CUDA_VISIBLE_DEVICES=0 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 0)) --end-index $(($NUM_START + $NUM_PER_GPU * 1)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_1.log 2>&1 &
CUDA_VISIBLE_DEVICES=1 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 1)) --end-index $(($NUM_START + $NUM_PER_GPU * 2)) --image-size 1408 2816 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_2.log 2>&1 &
CUDA_VISIBLE_DEVICES=2 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 2)) --end-index $(($NUM_START + $NUM_PER_GPU * 3)) --image-size 2816 1408 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_3.log 2>&1 &
CUDA_VISIBLE_DEVICES=3 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 3)) --end-index $(($NUM_START + $NUM_PER_GPU * 4)) --image-size 1664 2304 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_4.log 2>&1 &
CUDA_VISIBLE_DEVICES=4 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 4)) --end-index $(($NUM_START + $NUM_PER_GPU * 5)) --image-size 2304 1664 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_5.log 2>&1 &
CUDA_VISIBLE_DEVICES=5 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 5)) --end-index $(($NUM_START + $NUM_PER_GPU * 6)) --image-size 1536 2560 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_6.log 2>&1 &
CUDA_VISIBLE_DEVICES=6 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 6)) --end-index $(($NUM_START + $NUM_PER_GPU * 7)) --image-size 2560 1536 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_7.log 2>&1 &
CUDA_VISIBLE_DEVICES=7 $CMD --prompt-path $TEXT_PATH --save-dir $OUTPUT_PATH --start-index $(($NUM_START + $NUM_PER_GPU * 7)) --end-index $(($NUM_START + $NUM_PER_GPU * 8)) --image-size 2048 2048 --verbose 1 --batch-size 2 >${LOG_BASE}/${N_LAUNCH}_8.log 2>&1 &

26
eval/human_eval/launch.sh Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Launch 8 human-evaluation sampling jobs in parallel, one task id per GPU,
# by fanning out to eval/sample.sh. Usage: launch.sh CKPT NUM_FRAMES MODEL_NAME
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
# Derive a readable checkpoint tag: for EMA checkpoints (path contains "ema"),
# use the parent directory name suffixed with "_ema".
# NOTE(review): CKPT_BASE is computed but not used below in this script —
# presumably consumed by a sourced/later script; confirm before removing.
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
CKPT_BASE=$(basename $parentdir)_ema
else
CKPT_BASE=$(basename $CKPT)
fi
# Per-task logs are written next to the checkpoint under <ckpt_dir>/eval.
LOG_BASE=$(dirname $CKPT)/eval
mkdir -p ${LOG_BASE}
echo "Logging to $LOG_BASE"
GPUS=(0 1 2 3 4 5 6 7)
# TASK_ID_LIST=(1 2a 2b 2c 2d 2e 2f 2g) # move image to video task
TASK_ID_LIST=(2a 2b 2c 2d 2e 2f 2g 2h)
# FRAME_LIST=(1 $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES $NUM_FRAMES)
# One background job per GPU; task id is passed as "-<id>" to eval/sample.sh
# and each job's output is captured in its own log file.
for i in "${!GPUS[@]}"; do
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -${TASK_ID_LIST[i]} >${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done
# kill all by: pkill -f "inference"

183
eval/loss/eval_loss.py Normal file
View File

@ -0,0 +1,183 @@
from pprint import pformat
import colossalai
import torch
import torch.distributed as dist
from colossalai.cluster import DistCoordinator
from mmengine.runner import set_random_seed
from tqdm import tqdm
from opensora.acceleration.parallel_states import get_data_parallel_group, set_data_parallel_group
from opensora.datasets.dataloader import prepare_dataloader
from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import create_logger, to_torch_dtype
from opensora.utils.train_utils import MaskGenerator
def main():
torch.set_grad_enabled(False)
# ======================================================
# configs & runtime variables
# ======================================================
# == parse configs ==
cfg = parse_configs(training=False)
# == device and dtype ==
device = "cuda" if torch.cuda.is_available() else "cpu"
cfg_dtype = cfg.get("dtype", "fp32")
assert cfg_dtype in ["fp16", "bf16", "fp32"], f"Unknown mixed precision {cfg_dtype}"
dtype = to_torch_dtype(cfg.get("dtype", "bf16"))
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# == init distributed env ==
colossalai.launch_from_torch({})
DistCoordinator()
set_random_seed(seed=cfg.get("seed", 1024))
set_data_parallel_group(dist.group.WORLD)
# == init logger ==
logger = create_logger()
logger.info("Eval loss configuration:\n %s", pformat(cfg.to_dict()))
# ======================================================
# build model & load weights
# ======================================================
logger.info("Building models...")
# == build text-encoder and vae ==
text_encoder = build_module(cfg.text_encoder, MODELS, device=device)
if text_encoder is not None:
text_encoder_output_dim = text_encoder.output_dim
text_encoder_model_max_length = text_encoder.model_max_length
cfg.dataset.tokenize_fn = text_encoder.tokenize_fn
else:
text_encoder_output_dim = cfg.get("text_encoder_output_dim", 4096)
text_encoder_model_max_length = cfg.get("text_encoder_model_max_length", 300)
vae = build_module(cfg.vae, MODELS).to(device, dtype).eval()
# == build diffusion model ==
input_size = (None, None, None)
latent_size = vae.get_latent_size(input_size)
model = (
build_module(
cfg.model,
MODELS,
input_size=latent_size,
in_channels=vae.out_channels,
caption_channels=text_encoder_output_dim,
model_max_length=text_encoder_model_max_length,
enable_sequence_parallelism=cfg.get("sp_size", 1) > 1,
)
.to(device, dtype)
.eval()
)
text_encoder.y_embedder = model.y_embedder # HACK: for classifier-free guidance
# == build scheduler ==
scheduler = build_module(cfg.scheduler, SCHEDULERS)
if cfg.get("mask_ratios", None) is not None:
mask_generator = MaskGenerator(cfg.mask_ratios)
# ======================================================
# inference
# ======================================================
# start evaluation, prepare a dataset everytime in the loop
bucket_config = cfg.bucket_config
if cfg.get("resolution", None) is not None:
bucket_config = {cfg.resolution: bucket_config[cfg.resolution]}
assert bucket_config is not None, "bucket_config is required for evaluation"
logger.info("Evaluating bucket_config: %s", bucket_config)
def build_dataset(resolution, num_frames, batch_size):
    """Build a single-bucket dataloader for one (resolution, num_frames) task.

    Returns (dataloader, num_steps_per_epoch, num_batch), where
    num_steps_per_epoch is the per-rank batch count under data parallelism.
    """
    # Single-bucket config; (1.0, batch_size) mirrors cfg.bucket_config entries —
    # the first element is presumably the bucket keep-probability (TODO confirm).
    bucket_config = {resolution: {num_frames: (1.0, batch_size)}}
    dataset = build_module(cfg.dataset, DATASETS)
    dataloader_args = dict(
        dataset=dataset,
        batch_size=None,  # batching is delegated to the bucket sampler
        num_workers=cfg.num_workers,
        shuffle=False,
        drop_last=False,
        pin_memory=True,
        process_group=get_data_parallel_group(),
    )
    dataloader, sampler = prepare_dataloader(bucket_config=bucket_config, **dataloader_args)
    num_batch = sampler.get_num_batch()
    # Split batches evenly across ranks; any remainder is dropped.
    num_steps_per_epoch = num_batch // dist.get_world_size()
    return dataloader, num_steps_per_epoch, num_batch
evaluation_losses = {}
start = cfg.start_index if "start_index" in cfg else 0
end = cfg.end_index if "end_index" in cfg else len(bucket_config)
for i, res in enumerate(bucket_config):
if len(bucket_config) > 1 and (i < start or i >= end): # skip task
print("skipping:", bucket_config[res])
continue
t_bucket = bucket_config[res]
num_frames_index = 0
for num_frames, (_, batch_size) in t_bucket.items():
if batch_size is None:
continue
if len(bucket_config) == 1 and (num_frames_index < start or num_frames_index >= end): # skip task
print("skipping:", num_frames)
num_frames_index += 1
continue
else:
num_frames_index += 1
logger.info("Evaluating resolution: %s, num_frames: %s", res, num_frames)
dataloader, num_steps_per_epoch, num_batch = build_dataset(res, num_frames, batch_size)
if num_batch == 0:
logger.warning("No data for resolution: %s, num_frames: %s", res, num_frames)
continue
evaluation_t_losses = []
for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10) + 2)[1:-1]:
loss_t = 0.0
num_samples = 0
dataloader_iter = iter(dataloader)
for _ in tqdm(range(num_steps_per_epoch), desc=f"res: {res}, num_frames: {num_frames}, t: {t:.2f}"):
batch = next(dataloader_iter)
x = batch.pop("video").to(device, dtype)
batch.pop("text")
x = vae.encode(x)
input_ids = batch.pop("input_ids")
attention_mask = batch.pop("attention_mask")
model_args = text_encoder.encode(input_ids, attention_mask=attention_mask)
# == mask ==
mask = None
if cfg.get("mask_ratios", None) is not None:
mask = mask_generator.get_masks(x)
model_args["x_mask"] = mask
# == video meta info ==
for k, v in batch.items():
model_args[k] = v.to(device, dtype)
# == diffusion loss computation ==
timestep = torch.tensor([t] * x.shape[0], device=device, dtype=dtype)
loss_dict = scheduler.training_losses(model, x, model_args, mask=mask, t=timestep)
losses = loss_dict["loss"] # (batch_size)
num_samples += x.shape[0]
loss_t += losses.sum().item()
loss_t /= num_samples
evaluation_t_losses.append(loss_t)
logger.info("resolution: %s, num_frames: %s, timestep: %.2f, loss: %.4f", res, num_frames, t, loss_t)
evaluation_losses[(res, num_frames)] = sum(evaluation_t_losses) / len(evaluation_t_losses)
logger.info(
"Evaluation losses for resolution: %s, num_frames: %s, loss: %s\n %s",
res,
num_frames,
evaluation_losses[(res, num_frames)],
evaluation_t_losses,
)
logger.info("Evaluation losses: %s", evaluation_losses)
if __name__ == "__main__":
main()

38
eval/loss/launch.sh Normal file
View File

@ -0,0 +1,38 @@
#!/bin/bash
# Launch per-resolution eval_loss jobs in parallel across GPUs.
#
# Usage: launch.sh CKPT_PATH MODEL_NAME [IMG_PATH] [VID_PATH]
# Image buckets are sharded over GPUs 0-2 by bucket index; video evals run
# one resolution per GPU (GPUS[i] <-> RESOLUTION[i]). All jobs run in the
# background; logs land in <ckpt dir>/eval/.

CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
CKPT_PATH=$1
MODEL_NAME=$2  # NOTE(review): accepted but currently unused — confirm intent
IMG_PATH=$3
VID_PATH=$4

# Default validation sets when not provided.
# BUGFIX: expansions are now quoted — `[ -z $VAR ]` with an unset/space-y
# value either always succeeds or errors out under word splitting.
if [ -z "$IMG_PATH" ]; then
    IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
fi
if [ -z "$VID_PATH" ]; then
    VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
fi

# EMA checkpoints live inside the step directory; tag them with "_ema".
# NOTE(review): CKPT_BASE is computed but never used below — confirm whether
# it should be part of LOG_BASE.
if [[ $CKPT_PATH == *"ema"* ]]; then
    parentdir=$(dirname "$CKPT_PATH")
    CKPT_BASE=$(basename "$parentdir")_ema
else
    CKPT_BASE=$(basename "$CKPT_PATH")
fi

LOG_BASE=$(dirname "$CKPT_PATH")/eval
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"

GPUS=(3 4 5 6 7)
RESOLUTION=(144p 240p 360p 480p 720p)

# Image buckets: split index ranges [0,5), [5,6), [6,end) across GPUs 0-2.
CUDA_VISIBLE_DEVICES=0 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 0 --end-index 5 >${LOG_BASE}/img_0.log 2>&1 &
CUDA_VISIBLE_DEVICES=1 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 5 --end-index 6 >${LOG_BASE}/img_1.log 2>&1 &
CUDA_VISIBLE_DEVICES=2 $CMD --data-path $IMG_PATH --ckpt-path $CKPT_PATH --start-index 6 >${LOG_BASE}/img_2.log 2>&1 &
# Videos: one resolution per remaining GPU.
for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} $CMD --data-path $VID_PATH --ckpt-path $CKPT_PATH --resolution ${RESOLUTION[i]} >${LOG_BASE}/${RESOLUTION[i]}_vid.log 2>&1 &
done

View File

@ -0,0 +1,10 @@
#!/bin/bash
# Run eval_loss for one checkpoint on a single resolution bucket (360p).
#
# Usage: <script> MASTER_PORT CKPT_PATH VID_PATH

# Script + config passed as two words — deliberately left unquoted below so
# the shell splits it into separate torchrun arguments.
CMD_FILE_CONFIG="eval/loss/eval_loss.py configs/opensora-pro/misc/eval_loss.py"
PORTS=$1
CKPT_PATH=$2
VID_PATH=$3

# only evaluate for 360p, 102f
RESOLUTION=360p

# BUGFIX: quote the per-argument expansions so paths containing spaces (or an
# empty VID_PATH) don't silently produce a malformed command line.
torchrun --master-port "${PORTS}" --nproc_per_node 1 $CMD_FILE_CONFIG --data-path "$VID_PATH" --ckpt-path "$CKPT_PATH" --resolution "${RESOLUTION}"

View File

@ -0,0 +1,55 @@
"""
usage:
python tabulate_rl_loss.py --log_dir /home/zhengzangwei/projs/Open-Sora-dev/logs/loss --ckpt_name epoch0-global_step9000
save the processed json to:
Open-Sora-dev/evaluation_results/rectified_flow/<ckpt_name>_loss.json
"""
import argparse
import json
import os
from ast import literal_eval
def parse_args():
    """Parse command-line options; --log_dir points at the eval log directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_dir", type=str)
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Fixed set of per-bucket logs produced by eval/loss/launch.sh.
    # BUGFIX: removed a dead `files = os.listdir(args.log_dir)` whose result
    # was immediately overwritten by this authoritative, ordered list.
    files = [
        "img_0.log",
        "img_1.log",
        "img_2.log",
        "144p_vid.log",
        "240p_vid.log",
        "360p_vid.log",
        "480p_vid.log",
        "720p_vid.log",
    ]
    loss_info = {}
    for fname in files:
        path = os.path.join(args.log_dir, fname)
        with open(path, "r", encoding="utf-8") as f:
            content = f.readlines()
        # Last log line looks like "... losses: {(res, frames): loss, ...}";
        # parse the dict literal after the "losses:" marker.
        eval_line = content[-1].split("losses:")[-1].strip()
        loss_dict = literal_eval(eval_line)
        for key, loss in loss_dict.items():
            resolution, frame = key
            if resolution not in loss_info:
                loss_info[resolution] = {}
            loss_info[resolution][frame] = format(loss, ".4f")
    # Convert and write JSON object to file (explicit encoding for portability).
    output_file_path = os.path.join(args.log_dir, "loss.json")
    with open(output_file_path, "w", encoding="utf-8") as outfile:
        json.dump(loss_info, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")

311
eval/sample.sh Normal file
View File

@ -0,0 +1,311 @@
#!/bin/bash
# BUGFIX: the first line was "# !/bin/bash" — a plain comment, not a shebang —
# so the script ran under whatever shell invoked it instead of bash.
#
# Sample-generation driver. Positional args:
#   CKPT NUM_FRAMES MODEL_NAME TASK_TYPE VBENCH_START_INDEX VBENCH_END_INDEX
#   VBENCH_RES VBENCH_ASP_RATIO NUM_SAMPLING_STEPS FLOW LLM_REFINE
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
TASK_TYPE=$4
VBENCH_START_INDEX=$5
VBENCH_END_INDEX=$6
VBENCH_RES=$7
VBENCH_ASP_RATIO=$8
NUM_SAMPLING_STEPS=$9
FLOW=${10}
LLM_REFINE=${11}

# Resolve the resolution ladder: find BASE_ASPECT_RATIO in ASPECT_RATIOS and
# remember the entries one and two levels above it, clamped to the top.
BASE_ASPECT_RATIO=360p
ASPECT_RATIOS=(360p 720p)
i=0
for r in "${ASPECT_RATIOS[@]}"; do
    if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
        # one level up (or the highest entry if already at the top)
        if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
            ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
        else
            ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
        fi
        # two levels up (or the highest entry)
        if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
            ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
        else
            ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
        fi
    fi
    i=$((i+1))
done
echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
echo "Note that this aspect ratio level setting is used for videos only, not images"

echo "NUM_FRAMES=${NUM_FRAMES}"
if [ -z "${NUM_FRAMES}" ]; then
    echo "you need to pass NUM_FRAMES"
else
    # Use the named variable rather than raw $2 for consistency (same value).
    let DOUBLE_FRAMES=$NUM_FRAMES*2
    let QUAD_FRAMES=$NUM_FRAMES*4
    let OCT_FRAMES=$NUM_FRAMES*8
fi
echo "DOUBLE_FRAMES=${DOUBLE_FRAMES}"
echo "QUAD_FRAMES=${QUAD_FRAMES}"
echo "OCT_FRAMES=${OCT_FRAMES}"

# CMD="python scripts/inference.py configs/opensora-v1-2/inference/sample.py"
CMD="python scripts/inference.py configs/opensora-v1-3/inference/t2v.py"
CMD_I2V="python scripts/inference_i2v.py configs/opensora-v1-3/inference/v2v.py"

# EMA checkpoints live inside the step directory; tag the output dir "_ema".
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname "$CKPT")
    CKPT_BASE=$(basename "$parentdir")_ema
else
    CKPT_BASE=$(basename "$CKPT")
fi
OUTPUT="/mnt/jfs-hdd/sora/samples/samples_${MODEL_NAME}_${CKPT_BASE}"
start=$(date +%s)
DEFAULT_BS=1
### Functions

# called inside run_video_b
# Image sampling: first 3 prompts of each prompt file at 360p 1:1, then a
# single fixed prompt rendered at 720p in every supported aspect ratio.
function run_image() {
    # 360p multi-sample
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 1 --resolution 360p --aspect-ratio 1:1 --sample-name image_sora_360p_1_1 --end-index 3 --batch-size $DEFAULT_BS
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 1 --resolution 360p --aspect-ratio 1:1 --sample-name image_short_360p_1_1 --end-index 3 --batch-size $DEFAULT_BS
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 360p --aspect-ratio 1:1 --sample-name image_t2v_360p_1_1 --end-index 3 --batch-size $DEFAULT_BS
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 360p --aspect-ratio 1:1 --sample-name image_t2i_360p_1_1 --end-index 3 --batch-size $DEFAULT_BS
    # 720p multi-resolution
    # 1:1
    PROMPT="Bright scene, aerial view,ancient city, fantasy, gorgeous light, mirror reflection, high detail, wide angle lens."
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 1:1 --sample-name image_720p_1_1
    # 9:16
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 9:16 --sample-name image_720p_9_16
    # 16:9
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 16:9 --sample-name image_720p_16_9
    # 4:3
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 4:3 --sample-name image_720p_4_3
    # 3:4
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 3:4 --sample-name image_720p_3_4
    # 1:2
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 1:2 --sample-name image_720p_1_2
    # 2:1
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 1 --resolution 720p --aspect-ratio 2:1 --sample-name image_720p_2_1
}
# Video suite A: t2v_samples prompts at 720p and 360p, plus a GPT-refined pass
# over rand_types prompts (requires OPENAI_API_KEY for --llm-refine).
function run_video_a() {
    # sample, 720p, 9:16
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 97 --resolution 720p --aspect-ratio 9:16 --sample-name sample_97_720p --batch-size $DEFAULT_BS
    # sample, 360p, 9:16
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 97 --resolution 360p --aspect-ratio 9:16 --sample-name sample_97_360p --batch-size $DEFAULT_BS
    # sample random type, 720p, 9:16
    # Fail fast when the API key for prompt refinement is missing.
    if [[ -z "${OPENAI_API_KEY}" ]];
    then
        echo "Error: Required environment variable 'OPENAI_API_KEY' is not set."
        exit 1
    else
        eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/rand_types.txt --save-dir $OUTPUT --num-frames 97 --resolution 720p --aspect-ratio 9:16 --sample-name rand_types_2s_720p --batch-size $DEFAULT_BS --llm-refine True
    fi
}
# Video suite B: the full image suite (run_image) plus the short prompt set at
# 720p and 360p.
function run_video_b() {
    echo "Inside run_video_b, running image samples..."
    run_image
    echo "Inside run_video_b, running video samples..."
    # short, 720p, 9:16
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 97 --resolution 720p --aspect-ratio 9:16 --sample-name short_97_720p --batch-size $DEFAULT_BS
    # short, 360p, 9:16
    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 97 --resolution 360p --aspect-ratio 9:16 --sample-name short_97_360p --batch-size $DEFAULT_BS
}
# Video suite C: one fixed drone prompt at 720p/49 frames across all aspect
# ratios, then motion-score and aesthetic-score conditioning sweeps at the
# resolution two ladder levels above base (ASPECT_RATIO_INCR_2).
function run_video_c() {
    # 720p, multi-resolution
    # 1:1
    PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 1:1 --sample-name drone_cliff_prompt_720p_49_1_1
    # 16:9
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 16:9 --sample-name drone_cliff_prompt_720p_49_16_9
    # 9:16
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 9:16 --sample-name drone_cliff_prompt_720p_49_9_16
    # 4:3
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 4:3 --sample-name drone_cliff_prompt_720p_49_4_3
    # 3:4
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 3:4 --sample-name drone_cliff_prompt_720p_49_3_4
    # 1:2
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 1:2 --sample-name drone_cliff_prompt_720p_49_1_2
    # 2:1
    eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 49 --resolution 720p --aspect-ratio 2:1 --sample-name drone_cliff_prompt_720p_49_2_1
    # add motion score
    eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
        \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 6.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 10.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 25.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 50.0\" \
        \"A stylish woman walking in the street of Tokyo. motion score: 100.0\"
    # add aes score
    eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
        \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.5\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.0\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 6.5\" \
        \"A stylish woman walking in the street of Tokyo. aesthetic score: 7.0\"
}
# vbench has 950 samples
VBENCH_BS=1
VBENCH_H=360
VBENCH_W=640

# Generate VBench samples for prompt indices [$1, $2).
# The optional settings (VBENCH_RES/VBENCH_ASP_RATIO, NUM_SAMPLING_STEPS,
# FLOW, LLM_REFINE) select progressively more specific command lines;
# FLOW="None" means "llm-refine without a flow score".
# NOTE(review): NUM_SAMPLING_STEPS is only tested for emptiness and never
# forwarded — confirm whether --num-sampling-steps should be passed.
function run_vbench() {
    if [ -z ${VBENCH_RES} ] || [ -z ${VBENCH_ASP_RATIO} ]; then
        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
            --prompt-path assets/texts/VBench/all_dimension.txt \
            --image-size $VBENCH_H $VBENCH_W \
            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
    else
        if [ -z ${NUM_SAMPLING_STEPS} ]; then
            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
                --prompt-path assets/texts/VBench/all_dimension.txt \
                --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
                --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
        else
            if [ -z ${FLOW} ]; then
                eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
                    --prompt-path assets/texts/VBench/all_dimension.txt \
                    --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
                    --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
            else
                if [ -z ${LLM_REFINE} ]; then
                    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
                        --prompt-path assets/texts/VBench/all_dimension.txt \
                        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} \
                        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                else
                    if [ "${FLOW}" = "None" ]; then
                        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
                            --prompt-path assets/texts/VBench/all_dimension.txt \
                            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --llm-refine ${LLM_REFINE} \
                            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                    else
                        # BUGFIX: was "$VBENCH_ASP_RATI" (typo) — it expanded to
                        # nothing, so --aspect-ratio consumed the next flag.
                        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
                            --prompt-path assets/texts/VBench/all_dimension.txt \
                            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} --llm-refine ${LLM_REFINE} \
                            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                    fi
                fi
            fi
        fi
    fi
}
# vbench-i2v has 1120 samples
VBENCH_I2V_H=360
VBENCH_I2V_W=360

# Generate VBench-I2V samples for prompt indices [$1, $2); mirrors run_vbench
# but uses the all_i2v prompt list and a square default image size.
# NOTE(review): NUM_SAMPLING_STEPS is only tested for emptiness, never passed
# to the command — confirm whether --num-sampling-steps should be forwarded.
function run_vbench_i2v() {
    if [ -z ${VBENCH_RES} ] || [ -z ${VBENCH_ASP_RATIO} ]; then
        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
            --prompt-path assets/texts/VBench/all_i2v.txt \
            --image-size $VBENCH_I2V_H $VBENCH_I2V_W \
            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
    else
        if [ -z ${NUM_SAMPLING_STEPS} ]; then
            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
                --prompt-path assets/texts/VBench/all_i2v.txt \
                --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
                --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
        else
            if [ -z ${FLOW} ]; then
                eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
                    --prompt-path assets/texts/VBench/all_i2v.txt \
                    --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
                    --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
            else
                if [ -z ${LLM_REFINE} ]; then
                    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
                        --prompt-path assets/texts/VBench/all_i2v.txt \
                        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} \
                        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                else
                    # FLOW="None" selects llm-refine only; otherwise pass both.
                    if [ "${FLOW}" = "None" ]; then
                        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
                            --prompt-path assets/texts/VBench/all_i2v.txt \
                            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --llm-refine ${LLM_REFINE} \
                            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                    else
                        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
                            --prompt-path assets/texts/VBench/all_i2v.txt \
                            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --flow ${FLOW} --llm-refine ${LLM_REFINE} \
                            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
                    fi
                fi
            fi
        fi
    fi
}
### Main
# Dispatch on the remaining flags.
# NOTE(review): "--video" matches the -2a, -2b AND -2c branches, so passing
# --video runs all three video suites — confirm this is intended.
for arg in "$@"; do
    # image
    if [[ "$arg" = -1 ]] || [[ "$arg" = --image ]]; then
        echo "Running image samples..."
        run_image
    fi
    if [[ "$arg" = -2a ]] || [[ "$arg" = --video ]]; then
        echo "Running video samples a..."
        run_video_a
    fi
    if [[ "$arg" = -2b ]] || [[ "$arg" = --video ]]; then
        echo "Running video samples b..."
        run_video_b
    fi
    if [[ "$arg" = -2c ]] || [[ "$arg" = --video ]]; then
        echo "Running video samples c..."
        run_video_c
    fi
    # vbench
    if [[ "$arg" = -4 ]] || [[ "$arg" = --vbench ]]; then
        echo "Running vbench samples ..."
        # vbench sharding requires an explicit prompt index range.
        if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
            echo "need to set start_index and end_index"
        else
            run_vbench $VBENCH_START_INDEX $VBENCH_END_INDEX
        fi
    fi
    # vbench-i2v
    if [[ "$arg" = -5 ]] || [[ "$arg" = --vbench-i2v ]]; then
        echo "Running vbench-i2v samples ..."
        if [ -z ${VBENCH_START_INDEX} ] || [ -z ${VBENCH_END_INDEX} ]; then
            echo "need to set start_index and end_index"
        else
            run_vbench_i2v $VBENCH_START_INDEX $VBENCH_END_INDEX
        fi
    fi
done
### End

# Report total wall-clock time for the selected suites.
end=$(date +%s)
runtime=$((end - start))
echo "Runtime: $runtime seconds"

89
eval/vae/cal_flolpips.py Normal file
View File

@ -0,0 +1,89 @@
import sys
import numpy as np
import torch
from tqdm import tqdm
sys.path.append(".")
from flolpips.flolpips import FloLPIPS
from flolpips.pwcnet import Network as PWCNet
loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
flownet = PWCNet().eval().requires_grad_(False)
def trans(x):
    """Placeholder preprocessing hook — returns its input unchanged."""
    return x
def calculate_flolpips(videos1, videos2, device):
    """Flow-weighted LPIPS (FloLPIPS) between two video batches.

    videos*: [batch, time, channel, h, w]; when the time dimensions differ,
    both batches are truncated to the shorter one. Optical flow between
    consecutive frames is computed with PWCNet on each video and the flow
    difference weights the LPIPS comparison.
    """
    global loss_fn, flownet
    print("calculate_flowlpips...")
    # Move the module-level networks to the target device (mutates the globals).
    loss_fn = loss_fn.to(device)
    flownet = flownet.to(device)
    if videos1.shape != videos2.shape:
        print("Warning: the shape of videos are not equal.")
        min_frames = min(videos1.shape[1], videos2.shape[1])
        videos1 = videos1[:, :min_frames]
        videos2 = videos2[:, :min_frames]
    videos1 = trans(videos1)
    videos2 = trans(videos2)
    flolpips_results = []
    for video_num in tqdm(range(videos1.shape[0])):
        video1 = videos1[video_num].to(device)
        video2 = videos2[video_num].to(device)
        # Consecutive frame pairs; variable names suggest video1 is the
        # reconstruction and video2 the ground truth — TODO confirm at callers.
        frames_rec = video1[:-1]
        frames_rec_next = video1[1:]
        frames_gt = video2[:-1]
        frames_gt_next = video2[1:]
        t, c, h, w = frames_gt.shape
        flow_gt = flownet(frames_gt, frames_gt_next)
        flow_dis = flownet(frames_rec, frames_rec_next)
        flow_diff = flow_gt - flow_dis
        # normalize=True: presumably tells FloLPIPS the inputs are in [0, 1]
        # rather than [-1, 1] (lpips convention) — verify against flolpips docs.
        flolpips = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True)
        flolpips_results.append(flolpips.cpu().numpy().tolist())
    flolpips_results = np.array(flolpips_results)  # [batch_size, num_frames]
    flolpips = {}
    flolpips_std = {}
    # Per-timestep mean/std across the batch.
    for clip_timestamp in range(flolpips_results.shape[1]):
        flolpips[clip_timestamp] = np.mean(flolpips_results[:, clip_timestamp], axis=-1)
        flolpips_std[clip_timestamp] = np.std(flolpips_results[:, clip_timestamp], axis=-1)
    result = {
        "value": flolpips,
        "value_std": flolpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
        # NOTE(review): "result" holds a numpy array (not JSON-serializable)
        # and duplicates "details" — confirm which consumers rely on each.
        "result": flolpips_results,
        "details": flolpips_results.tolist(),
    }
    return result
# test code / using example
def main():
    """Smoke test: FloLPIPS of two all-zero video batches on cuda:0."""
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    import json

    result = calculate_flolpips(videos1, videos2, "cuda:0")
    # NOTE(review): result["result"] is a numpy array, which json.dumps cannot
    # serialize — this example likely raises TypeError; verify.
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()

99
eval/vae/cal_lpips.py Normal file
View File

@ -0,0 +1,99 @@
import lpips
import numpy as np
import torch
from tqdm import tqdm
spatial = True # Return a spatial map of perceptual distance.
# Linearly calibrated models (LPIPS)
loss_fn = lpips.LPIPS(net="alex", spatial=spatial) # Can also set net = 'squeeze' or 'vgg'
# loss_fn = lpips.LPIPS(net='alex', spatial=spatial, lpips=False) # Can also set net = 'squeeze' or 'vgg'
def trans(x):
    """Prepare [0, 1] videos for LPIPS: expand grayscale to 3 channels, then
    rescale values to [-1, 1]."""
    # grayscale input -> replicate the channel axis three times
    if x.shape[-3] == 1:
        x = x.repeat(1, 1, 3, 1, 1)
    # [0, 1] -> [-1, 1]
    return x * 2 - 1
def calculate_lpips(videos1, videos2, device):
    """Per-frame LPIPS between two video batches.

    videos*: [batch, time, channel, h, w] float tensors in [0, 1]; grayscale
    inputs are expanded to 3 channels by ``trans``, which also rescales to
    the [-1, 1] range LPIPS expects.
    Returns a dict with per-timestamp mean/std over the batch.
    """
    # image should be RGB, IMPORTANT: normalized to [-1,1]
    print("calculate_lpips...")
    assert videos1.shape == videos2.shape
    videos1 = trans(videos1)
    videos2 = trans(videos2)
    # PERF FIX: move the module-global LPIPS net to the device once, instead of
    # calling .to(device) inside the innermost per-frame loop as before.
    loss_fn.to(device)
    lpips_results = []
    for video_num in tqdm(range(videos1.shape[0])):
        # video [timestamps, channel, h, w]
        video1 = videos1[video_num]
        video2 = videos2[video_num]
        lpips_results_of_a_video = []
        for clip_timestamp in range(len(video1)):
            # one frame pair at a time, batched as [1, c, h, w]
            img1 = video1[clip_timestamp].unsqueeze(0).to(device)
            img2 = video2[clip_timestamp].unsqueeze(0).to(device)
            lpips_results_of_a_video.append(loss_fn.forward(img1, img2).mean().detach().cpu().tolist())
        lpips_results.append(lpips_results_of_a_video)
    lpips_results = np.array(lpips_results)  # [batch, frames]
    lpips = {}
    lpips_std = {}
    # Per-timestamp statistics across the batch.
    for clip_timestamp in range(len(video1)):
        lpips[clip_timestamp] = np.mean(lpips_results[:, clip_timestamp])
        lpips_std[clip_timestamp] = np.std(lpips_results[:, clip_timestamp])
    result = {
        "value": lpips,
        "value_std": lpips_std,
        "video_setting": video1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }
    return result
# test code / using example
def main():
    """Smoke test: LPIPS between an all-zero and an all-one video batch on CUDA."""
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.ones(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    device = torch.device("cuda")
    # device = torch.device("cpu")
    import json

    result = calculate_lpips(videos1, videos2, device)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()

92
eval/vae/cal_psnr.py Normal file
View File

@ -0,0 +1,92 @@
import math
import numpy as np
import torch
from tqdm import tqdm
def img_psnr(img1, img2):
    """PSNR (in dB) between two images with values in [0, 1].

    Returns 100 for (near-)identical images to avoid log(0).
    """
    # compute mse; dividing by 1.0 coerces integer arrays to float
    mse = np.mean((img1 / 1.0 - img2 / 1.0) ** 2)
    if mse < 1e-10:
        return 100
    # compute psnr against the [0, 1] peak value
    return 20 * math.log10(1 / math.sqrt(mse))
def trans(x):
    """Identity hook, kept so every VAE metric shares the same preprocessing API."""
    return x
def calculate_psnr(videos1, videos2):
    """Per-frame PSNR averaged over a batch of videos.

    videos*: [batch, time, channel, h, w] tensors with values in [0, 1].
    Returns a dict with per-timestamp mean/std across the batch.
    """
    print("calculate_psnr...")
    # videos [batch_size, timestamps, channel, h, w]
    assert videos1.shape == videos2.shape
    videos1, videos2 = trans(videos1), trans(videos2)
    psnr_results = []
    for vid_idx in tqdm(range(videos1.shape[0])):
        v1, v2 = videos1[vid_idx], videos2[vid_idx]
        # one PSNR value per frame of this video
        psnr_results.append([img_psnr(v1[f].numpy(), v2[f].numpy()) for f in range(len(v1))])
    psnr_results = np.array(psnr_results)  # [batch_size, num_frames]
    psnr = {f: np.mean(psnr_results[:, f]) for f in range(len(v1))}
    psnr_std = {f: np.std(psnr_results[:, f]) for f in range(len(v1))}
    return {
        "value": psnr,
        "value_std": psnr_std,
        "video_setting": v1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }
# test code / using example
def main():
    """Smoke test: PSNR of two identical all-zero batches (100 dB everywhere)."""
    num_videos, video_len, channels, size = 8, 50, 3, 64
    shape = (num_videos, video_len, channels, size, size)
    videos1 = torch.zeros(*shape, requires_grad=False)
    videos2 = torch.zeros(*shape, requires_grad=False)
    import json

    print(json.dumps(calculate_psnr(videos1, videos2), indent=4))


if __name__ == "__main__":
    main()

119
eval/vae/cal_ssim.py Normal file
View File

@ -0,0 +1,119 @@
import cv2
import numpy as np
import torch
from tqdm import tqdm
def ssim(img1, img2):
    """SSIM between two single-channel images with values in [0, 1].

    Uses an 11x11 Gaussian window (sigma 1.5) and the standard stabilizers
    C1 = 0.01^2, C2 = 0.03^2; statistics are taken over the "valid" region
    (5-pixel border cropped after filtering).
    """
    C1 = 0.01**2
    C2 = 0.03**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    # separable Gaussian -> 2-D window via outer product
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    # windowed (co)variances via E[x^2] - E[x]^2
    sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
    return ssim_map.mean()
def calculate_ssim_function(img1, img2):
    """SSIM for a single image pair with values in [0, 1].

    Accepts 2-D grayscale arrays or 3-D CHW arrays with 1 or 3 channels
    (3-channel SSIM is the mean of per-channel SSIMs).
    Raises ValueError for mismatched or unsupported shapes.
    """
    # [0,1]
    # ssim is the only metric extremely sensitive to gray being compared to b/w
    if not img1.shape == img2.shape:
        raise ValueError("Input images must have the same dimensions.")
    if img1.ndim == 2:
        return ssim(img1, img2)
    elif img1.ndim == 3:
        if img1.shape[0] == 3:
            ssims = []
            for i in range(3):
                ssims.append(ssim(img1[i], img2[i]))
            return np.array(ssims).mean()
        elif img1.shape[0] == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))
        else:
            # BUGFIX: previously fell through and returned None for any other
            # channel count (e.g. RGBA); fail loudly instead.
            raise ValueError("Wrong input image dimensions.")
    else:
        raise ValueError("Wrong input image dimensions.")
def trans(x):
    """No-op placeholder so all VAE metrics expose the same preprocessing hook."""
    return x
def calculate_ssim(videos1, videos2):
    """Per-frame SSIM averaged over a batch of videos.

    videos*: [batch, time, channel, h, w] tensors with values in [0, 1].
    Returns a dict with per-timestamp mean/std across the batch.
    """
    print("calculate_ssim...")
    # videos [batch_size, timestamps, channel, h, w]
    assert videos1.shape == videos2.shape
    videos1, videos2 = trans(videos1), trans(videos2)
    ssim_results = []
    for vid_idx in tqdm(range(videos1.shape[0])):
        v1, v2 = videos1[vid_idx], videos2[vid_idx]
        # one SSIM value per frame of this video
        ssim_results.append([calculate_ssim_function(v1[f].numpy(), v2[f].numpy()) for f in range(len(v1))])
    ssim_results = np.array(ssim_results)  # [batch_size, num_frames]
    # note: this local dict intentionally shadows the module-level ssim()
    ssim = {f: np.mean(ssim_results[:, f]) for f in range(len(v1))}
    ssim_std = {f: np.std(ssim_results[:, f]) for f in range(len(v1))}
    return {
        "value": ssim,
        "value_std": ssim_std,
        "video_setting": v1.shape,
        "video_setting_name": "time, channel, heigth, width",
    }
# test code / using example
def main():
    """Smoke test: SSIM of two identical all-zero video batches."""
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    # BUGFIX: removed a stray `torch.device("cuda")` whose result was discarded
    # — SSIM here runs entirely on CPU numpy arrays, so the call was a no-op.
    import json

    result = calculate_ssim(videos1, videos2)
    print(json.dumps(result, indent=4))


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,265 @@
"""Calculates the CLIP Scores
The CLIP model is a contrasitively learned language-image model. There is
an image encoder and a text encoder. It is believed that the CLIP model could
measure the similarity of cross modalities. Please find more information from
https://github.com/openai/CLIP.
The CLIP Score measures the Cosine Similarity between two embedded features.
This repository utilizes the pretrained CLIP Model to calculate
the mean average of cosine similarities.
See --help to see further details.
Code apapted from https://github.com/mseitzer/pytorch-fid and https://github.com/openai/CLIP.
Copyright 2023 The Hong Kong Polytechnic University
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import json
import os
import os.path as osp
import sys
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
import numpy as np
import torch
import torchvision.transforms as transforms
from decord import VideoReader, cpu
from pytorchvideo.transforms import ShortSideScale
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision.datasets.folder import pil_loader
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo
sys.path.append(".")
from cal_flolpips import calculate_flolpips
from cal_lpips import calculate_lpips
from cal_psnr import calculate_psnr
from cal_ssim import calculate_ssim
try:
    from tqdm import tqdm
except ImportError:
    # If tqdm is not available, provide a no-op stand-in. BUGFIX: accept the
    # same call shapes as the real tqdm — extra positional/keyword arguments
    # such as ``desc=`` or ``total=`` are ignored instead of raising TypeError.
    def tqdm(x, *args, **kwargs):
        return x
class VideoDataset(Dataset):
    """Paired dataset yielding real/generated videos (or images) for metric evaluation.

    Each item is a dict ``{"real": tensor, "generated": tensor}`` where both tensors
    have been preprocessed by ``_preprocess`` (values rescaled to [0, 1], short side
    resized to ``resolution``, then center-cropped to ``crop_size``). Real and
    generated files are paired by sorted filename order, so both directories must
    contain matching file sets.
    """

    def __init__(
        self,
        type,  # "image" or "video" (name kept for caller compatibility; shadows builtin)
        real_video_dir,
        generated_video_dir,
        num_frames,
        sample_rate=1,
        crop_size=None,
        resolution=128,
    ) -> None:
        super().__init__()
        self.type = type
        self.real_video_files = self._combine_without_prefix(real_video_dir)
        self.generated_video_files = self._combine_without_prefix(generated_video_dir)
        self.num_frames = num_frames
        self.sample_rate = sample_rate
        self.crop_size = crop_size
        self.short_size = resolution

    def __len__(self):
        # Pairing is driven by the real files; generated files are indexed in lockstep.
        return len(self.real_video_files)

    def __getitem__(self, index):
        if index >= len(self):
            raise IndexError
        real_video_file = self.real_video_files[index]
        generated_video_file = self.generated_video_files[index]
        # (fix) removed the per-sample debug print: under a DataLoader with workers
        # it floods stdout once per item.
        if self.type == "video":
            real_video_tensor = self._load_video(real_video_file)
            generated_video_tensor = self._load_video(generated_video_file)
        else:
            real_video_tensor = self._load_image(real_video_file)
            generated_video_tensor = self._load_image(generated_video_file)
        return {"real": real_video_tensor, "generated": generated_video_tensor}

    def _load_image(self, image_path):
        """Load one image as a single-frame video tensor and preprocess it."""
        image = pil_loader(image_path)
        # (fix) PILToTensor keeps uint8 [0, 255] so the shared /255 inside
        # _preprocess is applied exactly once; ToTensor would already rescale to
        # [0, 1] and the extra /255 squashed values into [0, 1/255].
        image = transforms.Compose([transforms.PILToTensor()])(image)  # (C, H, W), uint8
        video = image.unsqueeze(0)  # (1, C, H, W) == (T, C, H, W)
        video = video.permute(1, 0, 2, 3)  # (T, C, H, W) -> (C, T, H, W)
        return _preprocess(video, short_size=self.short_size, crop_size=self.crop_size)

    def _load_video(self, video_path):
        """Decode up to ``num_frames`` evenly spaced frames and preprocess them."""
        num_frames = self.num_frames
        sample_rate = self.sample_rate
        decord_vr = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(decord_vr)
        sample_frames_len = sample_rate * num_frames
        if total_frames >= sample_frames_len:
            s = 0
            e = s + sample_frames_len
            num_frames = num_frames
        else:
            # Clip is shorter than requested: sample proportionally fewer frames.
            s = 0
            e = total_frames
            num_frames = int(total_frames / sample_frames_len * num_frames)
            print(
                f"sample_frames_len {sample_frames_len}, only can sample {num_frames * sample_rate}",
                video_path,
                total_frames,
            )
        frame_id_list = np.linspace(s, e - 1, num_frames, dtype=int)
        video_data = decord_vr.get_batch(frame_id_list).asnumpy()
        video_data = torch.from_numpy(video_data)
        # (fix) corrected comment: this permute yields (T, C, H, W), not (C, T, H, W).
        # Downstream transforms only operate on the trailing (H, W) dims, so both
        # layouts preprocess identically.
        video_data = video_data.permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
        return _preprocess(video_data, short_size=self.short_size, crop_size=self.crop_size)

    def _combine_without_prefix(self, folder_path, prefix="."):
        """Return sorted paths of the files in ``folder_path``, skipping hidden ones."""
        folder = []
        os.makedirs(folder_path, exist_ok=True)  # tolerate a not-yet-created directory
        for name in os.listdir(folder_path):
            if name.startswith(prefix):
                continue
            if osp.isfile(osp.join(folder_path, name)):
                folder.append(osp.join(folder_path, name))
        folder.sort()
        return folder
def _preprocess(video_data, short_size=128, crop_size=None):
    """Rescale pixels to [0, 1], resize the short side to ``short_size``, then center-crop to ``crop_size``."""
    rescaled = video_data / 255.0
    resized = ShortSideScale(size=short_size)(rescaled)
    return CenterCropVideo(crop_size=crop_size)(resized)
def calculate_common_metric(args, dataloader, device):
    """Compute the requested quality metrics over paired real/generated batches.

    Args:
        args: parsed CLI namespace; ``args.metric`` is a list (or comma-separated
            string) of names among {"ssim", "psnr", "lpips", "flolpips"}.
        dataloader: yields dicts with "real" and "generated" tensors.
        device: resolved torch device used by the LPIPS / FloLPIPS networks.

    Returns:
        dict mapping each supported metric name to its mean score.
    """
    metric_dict = {}
    # Accept a comma-separated string as well as a list of metric names.
    if isinstance(args.metric, str):
        args.metric = [m.strip() for m in args.metric.split(",")]
    print(args.metric)
    supported = ("ssim", "psnr", "flolpips", "lpips")
    for metric in args.metric:
        if metric not in supported:
            # (fix) warn once per metric (not once per batch) and skip it entirely:
            # the old code still stored np.mean([]) == NaN, which later produced
            # invalid JSON in the results file.
            print(f"metric {metric} is not in accepted list, not calculated")
            continue
        score_list = []
        for batch_data in tqdm(dataloader):  # {'real': real_video_tensor, 'generated': generated_video_tensor}
            real_videos = batch_data["real"]
            generated_videos = batch_data["generated"]
            # Frame counts must match for frame-wise comparison.
            assert real_videos.shape[2] == generated_videos.shape[2]
            if metric == "ssim":
                tmp_list = list(calculate_ssim(real_videos, generated_videos)["value"].values())
            elif metric == "psnr":
                tmp_list = list(calculate_psnr(real_videos, generated_videos)["value"].values())
            elif metric == "flolpips":
                # (fix) use the resolved ``device`` parameter; ``args.device`` is
                # None when --device is omitted on the command line.
                tmp_list = list(calculate_flolpips(real_videos, generated_videos, device)["value"].values())
            elif metric == "lpips":
                tmp_list = list(calculate_lpips(real_videos, generated_videos, device)["value"].values())
            score_list += tmp_list
        metric_dict[metric] = np.mean(score_list)
    return metric_dict
def main():
    """Parse CLI arguments, build the paired dataset/dataloader, compute the
    requested metrics, print them, and optionally dump them to a JSON file."""
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--type", type=str, choices=["video", "image"], default="video", help="whether evaluating images or videos"
    )
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size to use")
    parser.add_argument("--real_video_dir", type=str, help="the path of real videos")
    parser.add_argument("--generated_video_dir", type=str, help="the path of generated videos")
    parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu")
    parser.add_argument(
        "--num_workers",
        type=int,
        # (fix) default must be None, otherwise the documented min(8, num_cpus)
        # auto-detection below is unreachable dead code.
        default=None,
        help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"),
    )
    parser.add_argument("--sample_fps", type=int, default=30)
    parser.add_argument("--resolution", type=int, default=336)
    parser.add_argument("--crop_size", type=int, default=None)
    parser.add_argument("--num_frames", type=int, default=100)
    parser.add_argument("--sample_rate", type=int, default=1)
    parser.add_argument("--subset_size", type=int, default=None)
    parser.add_argument("--metric", nargs="+", default=[])
    parser.add_argument("--fvd_method", type=str, default="styleganv", choices=["styleganv", "videogpt"])
    parser.add_argument("--res_dir", type=str, default=None, help="dir to save result json")
    args = parser.parse_args()

    # Resolve the compute device; prefer CUDA when available and unspecified.
    if args.device is None:
        device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    else:
        device = torch.device(args.device)

    if args.num_workers is None:
        try:
            # Number of CPUs actually available to this process.
            num_cpus = len(os.sched_getaffinity(0))
        except AttributeError:
            # os.sched_getaffinity is not available under Windows, use
            # os.cpu_count instead (which may not return the *available* number
            # of CPUs).
            num_cpus = os.cpu_count()
        num_workers = min(num_cpus, 8) if num_cpus is not None else 0
    else:
        num_workers = args.num_workers

    dataset = VideoDataset(
        args.type,
        args.real_video_dir,
        args.generated_video_dir,
        num_frames=args.num_frames,
        sample_rate=args.sample_rate,
        crop_size=args.crop_size,
        resolution=args.resolution,
    )
    if args.subset_size:
        # Evaluate only the first subset_size pairs.
        indices = range(args.subset_size)
        dataset = Subset(dataset, indices=indices)
    dataloader = DataLoader(dataset, args.batch_size, num_workers=num_workers, pin_memory=True)

    metric_score = calculate_common_metric(args, dataloader, device)
    for k, v in metric_score.items():
        metric_score[k] = round(v, 3)
    print("metric: ", args.metric, " ", metric_score)

    if args.res_dir:
        output_file_path = os.path.join(
            args.res_dir, "metric_" + str(args.num_frames) + "f_" + str(args.resolution) + "res.json"
        )
        with open(output_file_path, "w") as outfile:
            json.dump(metric_score, outfile, indent=4, sort_keys=True)
        print(f"metric results saved to: {output_file_path}")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,461 @@
#!/usr/bin/env python
import re
import cupy
import torch
# CUDA source template: copies one (N, C, H, W) input into a zero-initialized
# channels-last buffer of shape (N, H+8, W+8, C), offsetting every pixel by +4
# in both spatial dims (the margin for the +/-4 displacement search).
# SIZE_d(tensor) placeholders are expanded by cupy_kernel before compilation.
kernel_Correlation_rearrange = """
extern "C" __global__ void kernel_Correlation_rearrange(
const int n,
const float* input,
float* output
) {
int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
if (intIndex >= n) {
return;
}
int intSample = blockIdx.z;
int intChannel = blockIdx.y;
float fltValue = input[(((intSample * SIZE_1(input)) + intChannel) * SIZE_2(input) * SIZE_3(input)) + intIndex];
__syncthreads();
int intPaddedY = (intIndex / SIZE_3(input)) + 4;
int intPaddedX = (intIndex % SIZE_3(input)) + 4;
int intRearrange = ((SIZE_3(input) + 8) * intPaddedY) + intPaddedX;
output[(((intSample * SIZE_1(output) * SIZE_2(output)) + intRearrange) * SIZE_1(input)) + intChannel] = fltValue;
}
"""
# CUDA source template: forward correlation. For each output pixel, dot-products
# the channel vector at (x1, y1) in rbot0 with the vector displaced by
# (s2o, s2p) in [-4, 4]^2 in rbot1, producing 81 output channels, each divided
# by the channel count. One thread block (32 threads) handles one pixel.
kernel_Correlation_updateOutput = """
extern "C" __global__ void kernel_Correlation_updateOutput(
const int n,
const float* rbot0,
const float* rbot1,
float* top
) {
extern __shared__ char patch_data_char[];
float *patch_data = (float *)patch_data_char;
// First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
int x1 = blockIdx.x + 4;
int y1 = blockIdx.y + 4;
int item = blockIdx.z;
int ch_off = threadIdx.x;
// Load 3D patch into shared shared memory
for (int j = 0; j < 1; j++) { // HEIGHT
for (int i = 0; i < 1; i++) { // WIDTH
int ji_off = (j + i) * SIZE_3(rbot0);
for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
int idx1 = ((item * SIZE_1(rbot0) + y1+j) * SIZE_2(rbot0) + x1+i) * SIZE_3(rbot0) + ch;
int idxPatchData = ji_off + ch;
patch_data[idxPatchData] = rbot0[idx1];
}
}
}
__syncthreads();
__shared__ float sum[32];
// Compute correlation
for (int top_channel = 0; top_channel < SIZE_1(top); top_channel++) {
sum[ch_off] = 0;
int s2o = top_channel % 9 - 4;
int s2p = top_channel / 9 - 4;
for (int j = 0; j < 1; j++) { // HEIGHT
for (int i = 0; i < 1; i++) { // WIDTH
int ji_off = (j + i) * SIZE_3(rbot0);
for (int ch = ch_off; ch < SIZE_3(rbot0); ch += 32) { // CHANNELS
int x2 = x1 + s2o;
int y2 = y1 + s2p;
int idxPatchData = ji_off + ch;
int idx2 = ((item * SIZE_1(rbot0) + y2+j) * SIZE_2(rbot0) + x2+i) * SIZE_3(rbot0) + ch;
sum[ch_off] += patch_data[idxPatchData] * rbot1[idx2];
}
}
}
__syncthreads();
if (ch_off == 0) {
float total_sum = 0;
for (int idx = 0; idx < 32; idx++) {
total_sum += sum[idx];
}
const int sumelems = SIZE_3(rbot0);
const int index = ((top_channel*SIZE_2(top) + blockIdx.y)*SIZE_3(top))+blockIdx.x;
top[index + item*SIZE_1(top)*SIZE_2(top)*SIZE_3(top)] = total_sum / (float)sumelems;
}
}
}
"""
# CUDA source template: gradient of the correlation output w.r.t. the FIRST
# input, computed per sample (intSample) with a grid-stride outer loop over all
# elements of gradFirst. gradSecond is unused here (the launcher passes None).
kernel_Correlation_updateGradFirst = """
#define ROUND_OFF 50000
extern "C" __global__ void kernel_Correlation_updateGradFirst(
const int n,
const int intSample,
const float* rbot0,
const float* rbot1,
const float* gradOutput,
float* gradFirst,
float* gradSecond
) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
int n = intIndex % SIZE_1(gradFirst); // channels
int l = (intIndex / SIZE_1(gradFirst)) % SIZE_3(gradFirst) + 4; // w-pos
int m = (intIndex / SIZE_1(gradFirst) / SIZE_3(gradFirst)) % SIZE_2(gradFirst) + 4; // h-pos
// round_off is a trick to enable integer division with ceil, even for negative numbers
// We use a large offset, for the inner part not to become negative.
const int round_off = ROUND_OFF;
const int round_off_s1 = round_off;
// We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
int xmin = (l - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)
int ymin = (m - 4 + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4)
// Same here:
int xmax = (l - 4 + round_off_s1) - round_off; // floor (l - 4)
int ymax = (m - 4 + round_off_s1) - round_off; // floor (m - 4)
float sum = 0;
if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
xmin = max(0,xmin);
xmax = min(SIZE_3(gradOutput)-1,xmax);
ymin = max(0,ymin);
ymax = min(SIZE_2(gradOutput)-1,ymax);
for (int p = -4; p <= 4; p++) {
for (int o = -4; o <= 4; o++) {
// Get rbot1 data:
int s2o = o;
int s2p = p;
int idxbot1 = ((intSample * SIZE_1(rbot0) + (m+s2p)) * SIZE_2(rbot0) + (l+s2o)) * SIZE_3(rbot0) + n;
float bot1tmp = rbot1[idxbot1]; // rbot1[l+s2o,m+s2p,n]
// Index offset for gradOutput in following loops:
int op = (p+4) * 9 + (o+4); // index[o,p]
int idxopoffset = (intSample * SIZE_1(gradOutput) + op);
for (int y = ymin; y <= ymax; y++) {
for (int x = xmin; x <= xmax; x++) {
int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
sum += gradOutput[idxgradOutput] * bot1tmp;
}
}
}
}
}
const int sumelems = SIZE_1(gradFirst);
const int bot0index = ((n * SIZE_2(gradFirst)) + (m-4)) * SIZE_3(gradFirst) + (l-4);
gradFirst[bot0index + intSample*SIZE_1(gradFirst)*SIZE_2(gradFirst)*SIZE_3(gradFirst)] = sum / (float)sumelems;
} }
"""
# CUDA source template: gradient of the correlation output w.r.t. the SECOND
# input, mirroring updateGradFirst with displacement signs flipped. gradFirst
# is unused here (the launcher passes None).
kernel_Correlation_updateGradSecond = """
#define ROUND_OFF 50000
extern "C" __global__ void kernel_Correlation_updateGradSecond(
const int n,
const int intSample,
const float* rbot0,
const float* rbot1,
const float* gradOutput,
float* gradFirst,
float* gradSecond
) { for (int intIndex = (blockIdx.x * blockDim.x) + threadIdx.x; intIndex < n; intIndex += blockDim.x * gridDim.x) {
int n = intIndex % SIZE_1(gradSecond); // channels
int l = (intIndex / SIZE_1(gradSecond)) % SIZE_3(gradSecond) + 4; // w-pos
int m = (intIndex / SIZE_1(gradSecond) / SIZE_3(gradSecond)) % SIZE_2(gradSecond) + 4; // h-pos
// round_off is a trick to enable integer division with ceil, even for negative numbers
// We use a large offset, for the inner part not to become negative.
const int round_off = ROUND_OFF;
const int round_off_s1 = round_off;
float sum = 0;
for (int p = -4; p <= 4; p++) {
for (int o = -4; o <= 4; o++) {
int s2o = o;
int s2p = p;
//Get X,Y ranges and clamp
// We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior:
int xmin = (l - 4 - s2o + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)
int ymin = (m - 4 - s2p + round_off_s1 - 1) + 1 - round_off; // ceil (l - 4 - s2o)
// Same here:
int xmax = (l - 4 - s2o + round_off_s1) - round_off; // floor (l - 4 - s2o)
int ymax = (m - 4 - s2p + round_off_s1) - round_off; // floor (m - 4 - s2p)
if (xmax>=0 && ymax>=0 && (xmin<=SIZE_3(gradOutput)-1) && (ymin<=SIZE_2(gradOutput)-1)) {
xmin = max(0,xmin);
xmax = min(SIZE_3(gradOutput)-1,xmax);
ymin = max(0,ymin);
ymax = min(SIZE_2(gradOutput)-1,ymax);
// Get rbot0 data:
int idxbot0 = ((intSample * SIZE_1(rbot0) + (m-s2p)) * SIZE_2(rbot0) + (l-s2o)) * SIZE_3(rbot0) + n;
float bot0tmp = rbot0[idxbot0]; // rbot1[l+s2o,m+s2p,n]
// Index offset for gradOutput in following loops:
int op = (p+4) * 9 + (o+4); // index[o,p]
int idxopoffset = (intSample * SIZE_1(gradOutput) + op);
for (int y = ymin; y <= ymax; y++) {
for (int x = xmin; x <= xmax; x++) {
int idxgradOutput = (idxopoffset * SIZE_2(gradOutput) + y) * SIZE_3(gradOutput) + x; // gradOutput[x,y,o,p]
sum += gradOutput[idxgradOutput] * bot0tmp;
}
}
}
}
}
const int sumelems = SIZE_1(gradSecond);
const int bot1index = ((n * SIZE_2(gradSecond)) + (m-4)) * SIZE_3(gradSecond) + (l-4);
gradSecond[bot1index + intSample*SIZE_1(gradSecond)*SIZE_2(gradSecond)*SIZE_3(gradSecond)] = sum / (float)sumelems;
} }
"""
def cupy_kernel(strFunction, objVariables):
    """Expand SIZE_/VALUE_ macros in the kernel template named ``strFunction``.

    Looks up the kernel source in module globals and substitutes concrete tensor
    sizes (``SIZE_d(tensor)``) and strided element accesses
    (``VALUE_d(tensor, i0, ..., i{d-1})``) from the tensors in ``objVariables``.

    Args:
        strFunction: name of a module-level kernel source string.
        objVariables: mapping of tensor name -> torch tensor supplying
            ``.size()`` / ``.stride()``.

    Returns:
        The kernel source with all macros replaced by literals.
    """
    strKernel = globals()[strFunction]
    # (fix) raw strings for the regex patterns: "\(" in a plain string is an
    # invalid escape sequence (SyntaxWarning on modern Python).
    # Replace every SIZE_d(tensor) with the tensor's size along dim d.
    while True:
        objMatch = re.search(r"(SIZE_)([0-4])(\()([^\)]*)(\))", strKernel)
        if objMatch is None:
            break
        intArg = int(objMatch.group(2))
        strTensor = objMatch.group(4)
        intSizes = objVariables[strTensor].size()
        strKernel = strKernel.replace(objMatch.group(), str(intSizes[intArg]))
    # Replace VALUE_d(tensor, ...) with a flat, stride-based element access.
    while True:
        objMatch = re.search(r"(VALUE_)([0-4])(\()([^\)]+)(\))", strKernel)
        if objMatch is None:
            break
        intArgs = int(objMatch.group(2))
        strArgs = objMatch.group(4).split(",")
        strTensor = strArgs[0]
        intStrides = objVariables[strTensor].stride()
        strIndex = [
            "(("
            + strArgs[intArg + 1].replace("{", "(").replace("}", ")").strip()
            + ")*"
            + str(intStrides[intArg])
            + ")"
            for intArg in range(intArgs)
        ]
        strKernel = strKernel.replace(objMatch.group(0), strTensor + "[" + str.join("+", strIndex) + "]")
    return strKernel
# Memoized per CUDA device so each kernel source is compiled at most once per device.
@cupy.memoize(for_each_device=True)
def cupy_launch(strFunction, strKernel):
    """Compile ``strKernel`` and return the raw CUDA kernel named ``strFunction``."""
    return cupy.RawKernel(strKernel, strFunction)


# end
class _FunctionCorrelation(torch.autograd.Function):
    """Custom autograd Function computing an 81-channel correlation volume
    (one channel per displacement in [-4, 4] x [-4, 4]) between two CUDA
    feature maps, using the cupy raw kernels defined above. CUDA-only."""

    @staticmethod
    def forward(self, first, second):
        # NOTE: `self` here is the autograd context object (conventionally `ctx`).
        # rbot0/rbot1: inputs rearranged channels-last with a 4-pixel zero border
        # on every side (hence the +8); filled in-place by the rearrange kernel.
        rbot0 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])
        rbot1 = first.new_zeros([first.shape[0], first.shape[2] + 8, first.shape[3] + 8, first.shape[1]])
        # NOTE(review): tensors are saved before .contiguous(); backward only
        # reads rbot0/rbot1 (populated below) plus shapes, so this appears
        # intentional — confirm before refactoring.
        self.save_for_backward(first, second, rbot0, rbot1)
        first = first.contiguous()
        assert first.is_cuda == True
        second = second.contiguous()
        assert second.is_cuda == True
        # 81 = 9x9 displacement candidates per output pixel.
        output = first.new_zeros([first.shape[0], 81, first.shape[2], first.shape[3]])
        if first.is_cuda == True:
            n = first.shape[2] * first.shape[3]
            cupy_launch(
                "kernel_Correlation_rearrange",
                cupy_kernel("kernel_Correlation_rearrange", {"input": first, "output": rbot0}),
            )(
                grid=tuple([int((n + 16 - 1) / 16), first.shape[1], first.shape[0]]),
                block=tuple([16, 1, 1]),
                args=[n, first.data_ptr(), rbot0.data_ptr()],
            )
            n = second.shape[2] * second.shape[3]
            cupy_launch(
                "kernel_Correlation_rearrange",
                cupy_kernel("kernel_Correlation_rearrange", {"input": second, "output": rbot1}),
            )(
                grid=tuple([int((n + 16 - 1) / 16), second.shape[1], second.shape[0]]),
                block=tuple([16, 1, 1]),
                args=[n, second.data_ptr(), rbot1.data_ptr()],
            )
            n = output.shape[1] * output.shape[2] * output.shape[3]
            # shared_mem holds one float per input channel for the pixel patch.
            cupy_launch(
                "kernel_Correlation_updateOutput",
                cupy_kernel("kernel_Correlation_updateOutput", {"rbot0": rbot0, "rbot1": rbot1, "top": output}),
            )(
                grid=tuple([output.shape[3], output.shape[2], output.shape[0]]),
                block=tuple([32, 1, 1]),
                shared_mem=first.shape[1] * 4,
                args=[n, rbot0.data_ptr(), rbot1.data_ptr(), output.data_ptr()],
            )
        elif first.is_cuda == False:
            raise NotImplementedError()
        # end
        return output

    # end
    @staticmethod
    def backward(self, gradOutput):
        """Propagate gradients to both inputs, one sample at a time."""
        first, second, rbot0, rbot1 = self.saved_tensors
        gradOutput = gradOutput.contiguous()
        assert gradOutput.is_cuda == True
        # Gradients are only allocated (and computed) for inputs that need them.
        gradFirst = (
            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
            if self.needs_input_grad[0] == True
            else None
        )
        gradSecond = (
            first.new_zeros([first.shape[0], first.shape[1], first.shape[2], first.shape[3]])
            if self.needs_input_grad[1] == True
            else None
        )
        if first.is_cuda == True:
            if gradFirst is not None:
                for intSample in range(first.shape[0]):
                    n = first.shape[1] * first.shape[2] * first.shape[3]
                    cupy_launch(
                        "kernel_Correlation_updateGradFirst",
                        cupy_kernel(
                            "kernel_Correlation_updateGradFirst",
                            {
                                "rbot0": rbot0,
                                "rbot1": rbot1,
                                "gradOutput": gradOutput,
                                "gradFirst": gradFirst,
                                "gradSecond": None,
                            },
                        ),
                    )(
                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
                        block=tuple([512, 1, 1]),
                        args=[
                            n,
                            intSample,
                            rbot0.data_ptr(),
                            rbot1.data_ptr(),
                            gradOutput.data_ptr(),
                            gradFirst.data_ptr(),
                            None,
                        ],
                    )
                # end
            # end
            if gradSecond is not None:
                for intSample in range(first.shape[0]):
                    n = first.shape[1] * first.shape[2] * first.shape[3]
                    cupy_launch(
                        "kernel_Correlation_updateGradSecond",
                        cupy_kernel(
                            "kernel_Correlation_updateGradSecond",
                            {
                                "rbot0": rbot0,
                                "rbot1": rbot1,
                                "gradOutput": gradOutput,
                                "gradFirst": None,
                                "gradSecond": gradSecond,
                            },
                        ),
                    )(
                        grid=tuple([int((n + 512 - 1) / 512), 1, 1]),
                        block=tuple([512, 1, 1]),
                        args=[
                            n,
                            intSample,
                            rbot0.data_ptr(),
                            rbot1.data_ptr(),
                            gradOutput.data_ptr(),
                            None,
                            gradSecond.data_ptr(),
                        ],
                    )
                # end
            # end
        elif first.is_cuda == False:
            raise NotImplementedError()
        # end
        return gradFirst, gradSecond

    # end


# end
def FunctionCorrelation(tenFirst, tenSecond):
    """Functional entry point: 81-channel (9x9 displacement) correlation volume
    of two CUDA feature maps."""
    return _FunctionCorrelation.apply(tenFirst, tenSecond)


# end
class ModuleCorrelation(torch.nn.Module):
    """nn.Module wrapper around the correlation autograd function (stateless)."""

    def __init__(self):
        super(ModuleCorrelation, self).__init__()

    def forward(self, tenFirst, tenSecond):
        return _FunctionCorrelation.apply(tenFirst, tenSecond)

View File

@ -0,0 +1,412 @@
from __future__ import absolute_import
import hashlib
import os
import requests
import torch
import torch.nn
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
from .pretrained_networks import alexnet, squeezenet, vgg16
from .pwcnet import Network as PWCNet
from .utils import *
# Download URL, local filename, and expected MD5 for each supported LPIPS backbone's
# calibration weights (only "alex" is shipped for flolpips).
URL_MAP = {"alex": "https://raw.githubusercontent.com/danier97/flolpips/main/weights/v0.1/alex.pth"}
CKPT_MAP = {"alex": "alex.pth"}
MD5_MAP = {"alex": "9642209e2b57a85d20f86d812320f9e6"}
def spatial_average(in_tens, keepdim=True):
    """Average a (B, C, H, W) tensor over its two spatial dimensions."""
    return torch.mean(in_tens, dim=[2, 3], keepdim=keepdim)
def mw_spatial_average(in_tens, flow, keepdim=True):
    """Spatially average ``in_tens`` weighted by normalized optical-flow magnitude."""
    _, _, h, w = in_tens.shape
    resized_flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
    magnitude = torch.sqrt(resized_flow[:, 0:1] ** 2 + resized_flow[:, 1:2] ** 2)
    weights = magnitude / torch.sum(magnitude, dim=[1, 2, 3], keepdim=True)
    return torch.sum(in_tens * weights, dim=[2, 3], keepdim=keepdim)
def mtw_spatial_average(in_tens, flow, texture, keepdim=True):
    """Spatial average weighted by flow magnitude divided by texture strength.

    Both weighting maps are min-max normalized (plus a small epsilon to keep the
    ratio finite) before the per-pixel weight is formed and renormalized to sum 1.
    """
    _, _, h, w = in_tens.shape
    resized_flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
    resized_texture = F.interpolate(texture, (h, w), align_corners=False, mode="bilinear")
    magnitude = torch.sqrt(resized_flow[:, 0:1] ** 2 + resized_flow[:, 1:2] ** 2)
    magnitude = (magnitude - magnitude.min()) / (magnitude.max() - magnitude.min()) + 1e-6
    resized_texture = (resized_texture - resized_texture.min()) / (resized_texture.max() - resized_texture.min()) + 1e-6
    weight = magnitude / resized_texture
    weight = weight / torch.sum(weight)
    return torch.sum(in_tens * weight, dim=[2, 3], keepdim=keepdim)
def m2w_spatial_average(in_tens, flow, keepdim=True):
    """Spatial average weighted by squared flow magnitude, normalized over the whole map."""
    _, _, h, w = in_tens.shape
    resized_flow = F.interpolate(flow, (h, w), align_corners=False, mode="bilinear")
    sq_magnitude = resized_flow[:, 0:1] ** 2 + resized_flow[:, 1:2] ** 2  # B,1,H,W
    sq_magnitude = sq_magnitude / torch.sum(sq_magnitude)
    return torch.sum(in_tens * sq_magnitude, dim=[2, 3], keepdim=keepdim)
def upsample(in_tens, out_HW=(64, 64)):  # assumes scale factor is same for H and W
    """Bilinearly resize a (B, C, H, W) tensor to the target spatial size."""
    # (fix) dropped the unused in_H/in_W locals from the original.
    return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens)
def md5_hash(path):
    """Return the hex MD5 digest of the file at ``path``."""
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()
def download(url, local_path, chunk_size=1024):
    """Stream ``url`` to ``local_path`` with a tqdm progress bar.

    The parent directory is created if needed; progress is measured against the
    Content-Length header (0 when the server omits it).
    """
    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        # (fix) report the bytes actually written; the final chunk
                        # is usually smaller than chunk_size, so updating by
                        # chunk_size overcounted progress.
                        pbar.update(len(data))
def get_ckpt_path(name, root, check=False):
    """Return the local checkpoint path for ``name``, downloading it when needed.

    With ``check=True`` the MD5 of an existing file is verified and the file is
    re-downloaded on mismatch. Freshly downloaded files are always MD5-verified.
    """
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    # md5_hash is only evaluated when the file exists (short-circuit).
    needs_download = not os.path.exists(path) or (check and md5_hash(path) != MD5_MAP[name])
    if needs_download:
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path
# Learned perceptual metric
class LPIPS(nn.Module):
    """Learned Perceptual Image Patch Similarity metric.

    Runs both inputs through a frozen classification backbone (alex / vgg /
    squeeze), unit-normalizes each layer's features, and combines the squared
    feature differences — through learned 1x1 "lin" layers when ``lpips=True``,
    or by plain channel summation otherwise.
    """

    def __init__(
        self,
        pretrained=True,
        net="alex",
        version="0.1",
        lpips=True,
        spatial=False,
        pnet_rand=False,
        pnet_tune=False,
        use_dropout=True,
        model_path=None,
        eval_mode=True,
        verbose=False,
    ):
        # lpips - [True] means with linear calibration on top of base network
        # pretrained - [True] means load linear weights
        super(LPIPS, self).__init__()
        if verbose:
            print(
                "Setting up [%s] perceptual loss: trunk [%s], v[%s], spatial [%s]"
                % ("LPIPS" if lpips else "baseline", net, version, "on" if spatial else "off")
            )
        self.pnet_type = net
        self.pnet_tune = pnet_tune
        self.pnet_rand = pnet_rand
        self.spatial = spatial
        self.lpips = lpips  # false means baseline of just averaging all layers
        self.version = version
        self.scaling_layer = ScalingLayer()
        # Per-layer channel counts of the chosen backbone.
        if self.pnet_type in ["vgg", "vgg16"]:
            net_type = vgg16
            self.chns = [64, 128, 256, 512, 512]
        elif self.pnet_type == "alex":
            net_type = alexnet
            self.chns = [64, 192, 384, 256, 256]
        elif self.pnet_type == "squeeze":
            net_type = squeezenet
            self.chns = [64, 128, 256, 384, 384, 512, 512]
        self.L = len(self.chns)
        self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune)
        if lpips:
            # One learned 1x1 conv per backbone layer, calibrating feature diffs.
            self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
            self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
            self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
            self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
            self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
            self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
            if self.pnet_type == "squeeze":  # 7 layers for squeezenet
                self.lin5 = NetLinLayer(self.chns[5], use_dropout=use_dropout)
                self.lin6 = NetLinLayer(self.chns[6], use_dropout=use_dropout)
                self.lins += [self.lin5, self.lin6]
            self.lins = nn.ModuleList(self.lins)
            if pretrained:
                self.load_from_pretrained(version, net)
                if verbose:
                    # NOTE(review): weights actually come from get_ckpt_path;
                    # model_path is typically None here — confirm.
                    print("Loaded model from: %s" % model_path)
        if eval_mode:
            self.eval()

    def load_from_pretrained(self, version, net):
        """Fetch (if needed) and load the calibration weights for ``net``/``version``."""
        ckpt = get_ckpt_path(net, "pretrained_models/flolpips/weights/v%s" % (version))
        self.load_state_dict(torch.load(ckpt, map_location="cpu"), strict=False)

    def forward(self, in0, in1, retPerLayer=False, normalize=False):
        """Return the LPIPS distance between batches ``in0`` and ``in1``.

        Inputs are expected in [-1, 1]; pass ``normalize=True`` for [0, 1] inputs.
        """
        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
            in0 = 2 * in0 - 1
            in1 = 2 * in1 - 1
        # v0.0 - original release had a bug, where input was not scaled
        in0_input, in1_input = (
            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
        )
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}
        # Squared differences of unit-normalized features, one entry per layer.
        for kk in range(self.L):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
        if self.lpips:
            if self.spatial:
                res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)]
        else:
            if self.spatial:
                res = [upsample(diffs[kk].sum(dim=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)]
            else:
                res = [spatial_average(diffs[kk].sum(dim=1, keepdim=True), keepdim=True) for kk in range(self.L)]
        # val = res[0]
        # for l in range(1,self.L):
        #     val += res[l]
        #     print(val)
        # a = spatial_average(self.lins[kk](diffs[kk]), keepdim=True)
        # b = torch.max(self.lins[kk](feats0[kk]**2))
        # for kk in range(self.L):
        #     a += spatial_average(self.lins[kk](diffs[kk]), keepdim=True)
        #     b = torch.max(b,torch.max(self.lins[kk](feats0[kk]**2)))
        # a = a/self.L
        # from IPython import embed
        # embed()
        # return 10*torch.log10(b/a)
        # if(retPerLayer):
        #     return (val, res)
        # else:
        # NOTE(review): relies on torch.sum accepting the NumPy-style `keepdims`
        # alias — confirm against the targeted torch version.
        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdims=False)
class ScalingLayer(nn.Module):
    """Shift/scale RGB inputs with the fixed per-channel LPIPS normalization constants."""

    def __init__(self):
        super(ScalingLayer, self).__init__()
        # Registered as buffers so they follow .to(device) but are not trained.
        self.register_buffer("shift", torch.Tensor([-0.030, -0.088, -0.188])[None, :, None, None])
        self.register_buffer("scale", torch.Tensor([0.458, 0.448, 0.450])[None, :, None, None])

    def forward(self, inp):
        shifted = inp - self.shift
        return shifted / self.scale
class NetLinLayer(nn.Module):
    """A single linear layer which does a 1x1 conv (optionally preceded by dropout)."""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super(NetLinLayer, self).__init__()
        stages = []
        if use_dropout:
            stages.append(nn.Dropout())
        stages.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        return self.model(x)
class Dist2LogitLayer(nn.Module):
    """takes 2 distances, puts through fc layers, spits out value between [0,1] (if use_sigmoid is True)"""

    def __init__(self, chn_mid=32, use_sigmoid=True):
        super(Dist2LogitLayer, self).__init__()
        stages = [
            nn.Conv2d(5, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, chn_mid, 1, stride=1, padding=0, bias=True),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(chn_mid, 1, 1, stride=1, padding=0, bias=True),
        ]
        if use_sigmoid:
            stages.append(nn.Sigmoid())
        self.model = nn.Sequential(*stages)

    def forward(self, d0, d1, eps=0.1):
        # Stack the two distances plus their difference and relative ratios
        # as the five input channels.
        features = torch.cat((d0, d1, d0 - d1, d0 / (d1 + eps), d1 / (d0 + eps)), dim=1)
        return self.model(features)
class BCERankingLoss(nn.Module):
    """BCE loss on the predicted preference between two distance maps.

    The human judgement ``judge`` is mapped from [-1, 1] to a probability
    target in [0, 1] before the BCE is applied.
    """

    def __init__(self, chn_mid=32):
        super(BCERankingLoss, self).__init__()
        self.net = Dist2LogitLayer(chn_mid=chn_mid)
        # self.parameters = list(self.net.parameters())
        self.loss = torch.nn.BCELoss()

    def forward(self, d0, d1, judge):
        per = (judge + 1.0) / 2.0
        self.logit = self.net.forward(d0, d1)
        return self.loss(self.logit, per)
# L2, DSSIM metrics
class FakeNet(nn.Module):
    """Common base for the non-learned metrics: records GPU usage and working colorspace."""

    def __init__(self, use_gpu=True, colorspace="Lab"):
        super(FakeNet, self).__init__()
        self.use_gpu = use_gpu
        self.colorspace = colorspace
class L2(FakeNet):
    """Plain mean-squared-error distance exposed as a FakeNet metric (batch size 1 only)."""

    def forward(self, in0, in1, retPerLayer=None):
        """Return the MSE between ``in0`` and ``in1``.

        RGB inputs are compared directly; Lab inputs are converted via the
        tensor2tensorlab helpers and compared with range 100.
        """
        assert in0.size()[0] == 1  # currently only supports batchSize 1
        if self.colorspace == "RGB":
            (N, C, X, Y) = in0.size()
            value = torch.mean(
                torch.mean(torch.mean((in0 - in1) ** 2, dim=1).view(N, 1, X, Y), dim=2).view(N, 1, 1, Y), dim=3
            ).view(N)
            return value
        elif self.colorspace == "Lab":
            value = l2(
                tensor2np(tensor2tensorlab(in0.data, to_norm=False)),
                tensor2np(tensor2tensorlab(in1.data, to_norm=False)),
                range=100.0,
            ).astype("float")
            # (fix) torch.autograd.Variable has been a deprecated no-op wrapper
            # since PyTorch 0.4; build the tensor directly.
            ret_var = torch.Tensor((value,))
            if self.use_gpu:
                ret_var = ret_var.cuda()
            return ret_var
class DSSIM(FakeNet):
    """SSIM-based dissimilarity (via the ``dssim`` helper) exposed as a FakeNet metric (batch size 1 only)."""

    def forward(self, in0, in1, retPerLayer=None):
        """Return the dissimilarity between ``in0`` and ``in1`` in the configured colorspace."""
        assert in0.size()[0] == 1  # currently only supports batchSize 1
        if self.colorspace == "RGB":
            value = dssim(1.0 * tensor2im(in0.data), 1.0 * tensor2im(in1.data), range=255.0).astype("float")
        elif self.colorspace == "Lab":
            value = dssim(
                tensor2np(tensor2tensorlab(in0.data, to_norm=False)),
                tensor2np(tensor2tensorlab(in1.data, to_norm=False)),
                range=100.0,
            ).astype("float")
        # (fix) torch.autograd.Variable has been a deprecated no-op wrapper
        # since PyTorch 0.4; build the tensor directly.
        ret_var = torch.Tensor((value,))
        if self.use_gpu:
            ret_var = ret_var.cuda()
        return ret_var
def print_network(net):
    """Print a network's structure and its total trainable-parameter count."""
    num_params = sum(param.numel() for param in net.parameters())
    print("Network", net)
    print("Total number of parameters: %d" % num_params)
class FloLPIPS(LPIPS):
    """LPIPS variant whose spatial pooling is weighted by optical-flow magnitude
    (``mw_spatial_average``) instead of a uniform mean."""

    def __init__(
        self,
        pretrained=True,
        net="alex",
        version="0.1",
        lpips=True,
        spatial=False,
        pnet_rand=False,
        pnet_tune=False,
        use_dropout=True,
        model_path=None,
        eval_mode=True,
        verbose=False,
    ):
        # All construction is delegated to LPIPS (positional pass-through).
        super(FloLPIPS, self).__init__(
            pretrained, net, version, lpips, spatial, pnet_rand, pnet_tune, use_dropout, model_path, eval_mode, verbose
        )

    def forward(self, in0, in1, flow, retPerLayer=False, normalize=False):
        """Return the flow-weighted LPIPS distance between ``in0`` and ``in1``.

        ``flow`` supplies the per-pixel weighting map; inputs are expected in
        [-1, 1] unless ``normalize=True``.
        """
        if normalize:  # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
            in0 = 2 * in0 - 1
            in1 = 2 * in1 - 1
        in0_input, in1_input = (
            (self.scaling_layer(in0), self.scaling_layer(in1)) if self.version == "0.1" else (in0, in1)
        )
        outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
        feats0, feats1, diffs = {}, {}, {}
        for kk in range(self.L):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
        res = [mw_spatial_average(self.lins[kk](diffs[kk]), flow, keepdim=True) for kk in range(self.L)]
        # NOTE(review): relies on torch.sum accepting the NumPy-style `keepdims`
        # alias — confirm against the targeted torch version.
        return torch.sum(torch.cat(res, 1), dim=(1, 2, 3), keepdims=False)
class Flolpips(nn.Module):
    """Video-interpolation quality metric: FloLPIPS of the predicted frame
    against ground truth, weighted by the flow-estimation error w.r.t. each
    neighboring frame (PWCNet flows), averaged over both neighbors."""

    def __init__(self):
        super(Flolpips, self).__init__()
        self.loss_fn = FloLPIPS(net="alex", version="0.1")
        self.flownet = PWCNet()

    @torch.no_grad()
    def forward(self, I0, I1, frame_dis, frame_ref):
        """
        args:
            I0: first frame of the triplet, shape: [B, C, H, W]
            I1: third frame of the triplet, shape: [B, C, H, W]
            frame_dis: prediction of the intermediate frame, shape: [B, C, H, W]
            frame_ref: ground-truth of the intermediate frame, shape: [B, C, H, W]
        returns:
            the mean of the two FloLPIPS scores (one per neighbor frame)
        """
        assert (
            I0.size() == I1.size() == frame_dis.size() == frame_ref.size()
        ), "the 4 input tensors should have same size"
        # Flow error w.r.t. I0 weights the first FloLPIPS evaluation.
        flow_ref = self.flownet(frame_ref, I0)
        flow_dis = self.flownet(frame_dis, I0)
        flow_diff = flow_ref - flow_dis
        flolpips_wrt_I0 = self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True)
        # Flow error w.r.t. I1 weights the second evaluation.
        flow_ref = self.flownet(frame_ref, I1)
        flow_dis = self.flownet(frame_dis, I1)
        flow_diff = flow_ref - flow_dis
        flolpips_wrt_I1 = self.loss_fn.forward(frame_ref, frame_dis, flow_diff, normalize=True)
        flolpips = (flolpips_wrt_I0 + flolpips_wrt_I1) / 2
        return flolpips

View File

@ -0,0 +1,182 @@
from collections import namedtuple
import torch
from torchvision import models as tv
class squeezenet(torch.nn.Module):
    """SqueezeNet-1.1 feature extractor exposing activations after 7 stages (relu1..relu7)."""

    # Half-open [lo, hi) index ranges of torchvision's squeezenet1_1 features
    # that make up each of the 7 slices.
    _STAGE_BOUNDS = ((0, 2), (2, 5), (5, 8), (8, 10), (10, 11), (11, 12), (12, 13))

    def __init__(self, requires_grad=False, pretrained=True):
        super(squeezenet, self).__init__()
        pretrained_features = tv.squeezenet1_1(pretrained=pretrained).features
        self.N_slices = 7
        for stage, (lo, hi) in enumerate(self._STAGE_BOUNDS, start=1):
            block = torch.nn.Sequential()
            for layer_idx in range(lo, hi):
                block.add_module(str(layer_idx), pretrained_features[layer_idx])
            setattr(self, "slice%d" % stage, block)
        if not requires_grad:
            # Freeze the backbone: it is used as a fixed feature extractor.
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        activations = []
        h = X
        for stage in range(1, self.N_slices + 1):
            h = getattr(self, "slice%d" % stage)(h)
            activations.append(h)
        vgg_outputs = namedtuple("SqueezeOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5", "relu6", "relu7"])
        return vgg_outputs(*activations)
class alexnet(torch.nn.Module):
    """Pretrained AlexNet split into five sequential feature slices.

    forward() returns the activation after each slice as a named tuple,
    following the LPIPS feature-extraction convention.
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super(alexnet, self).__init__()
        features = tv.alexnet(pretrained=pretrained).features
        # Layer index ranges delimiting the five slices of the feature stack.
        bounds = [(0, 2), (2, 5), (5, 8), (8, 10), (10, 12)]
        built = []
        for lo, hi in bounds:
            stage = torch.nn.Sequential()
            for idx in range(lo, hi):
                stage.add_module(str(idx), features[idx])
            built.append(stage)
        self.slice1, self.slice2, self.slice3, self.slice4, self.slice5 = built
        self.N_slices = 5
        if not requires_grad:
            # The network is used purely as a frozen feature extractor.
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5)
        activations = []
        h = X
        for stage in stages:
            h = stage(h)
            activations.append(h)
        AlexnetOutputs = namedtuple("AlexnetOutputs", ["relu1", "relu2", "relu3", "relu4", "relu5"])
        return AlexnetOutputs(*activations)
class vgg16(torch.nn.Module):
    """Pretrained VGG-16 split into five sequential feature slices.

    forward() returns the activation after each slice (relu1_2 ... relu5_3)
    as a named tuple, following the LPIPS feature-extraction convention.
    """

    def __init__(self, requires_grad=False, pretrained=True):
        super(vgg16, self).__init__()
        features = tv.vgg16(pretrained=pretrained).features
        # Layer index ranges delimiting the five slices of the feature stack.
        bounds = [(0, 4), (4, 9), (9, 16), (16, 23), (23, 30)]
        built = []
        for lo, hi in bounds:
            stage = torch.nn.Sequential()
            for idx in range(lo, hi):
                stage.add_module(str(idx), features[idx])
            built.append(stage)
        self.slice1, self.slice2, self.slice3, self.slice4, self.slice5 = built
        self.N_slices = 5
        if not requires_grad:
            # The network is used purely as a frozen feature extractor.
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        stages = (self.slice1, self.slice2, self.slice3, self.slice4, self.slice5)
        activations = []
        h = X
        for stage in stages:
            h = stage(h)
            activations.append(h)
        VggOutputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
        return VggOutputs(*activations)
class resnet(torch.nn.Module):
    """Pretrained torchvision ResNet exposed as five feature stages.

    Args:
        requires_grad: when False (default), all weights are frozen so the
            network acts as a fixed feature extractor (matching the sibling
            squeezenet/alexnet/vgg16 wrappers above).
        pretrained: load ImageNet weights.
        num: ResNet depth; one of 18, 34, 50, 101, 152.

    Raises:
        ValueError: if ``num`` is not a supported depth.
    """

    def __init__(self, requires_grad=False, pretrained=True, num=18):
        super(resnet, self).__init__()
        constructors = {
            18: tv.resnet18,
            34: tv.resnet34,
            50: tv.resnet50,
            101: tv.resnet101,
            152: tv.resnet152,
        }
        if num not in constructors:
            # Previously an unsupported depth left ``self.net`` unset and the
            # constructor failed later with an opaque AttributeError; fail fast.
            raise ValueError(
                "unsupported ResNet depth {}; expected one of {}".format(num, sorted(constructors))
            )
        self.net = constructors[num](pretrained=pretrained)
        self.N_slices = 5
        self.conv1 = self.net.conv1
        self.bn1 = self.net.bn1
        self.relu = self.net.relu
        self.maxpool = self.net.maxpool
        self.layer1 = self.net.layer1
        self.layer2 = self.net.layer2
        self.layer3 = self.net.layer3
        self.layer4 = self.net.layer4
        if not requires_grad:
            # The original accepted ``requires_grad`` but never used it; freeze
            # weights for consistency with the other backbone wrappers.
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.conv1(X)
        h = self.bn1(h)
        h = self.relu(h)
        h_relu1 = h
        h = self.maxpool(h)
        h = self.layer1(h)
        h_conv2 = h
        h = self.layer2(h)
        h_conv3 = h
        h = self.layer3(h)
        h_conv4 = h
        h = self.layer4(h)
        h_conv5 = h
        outputs = namedtuple("Outputs", ["relu1", "conv2", "conv3", "conv4", "conv5"])
        return outputs(h_relu1, h_conv2, h_conv3, h_conv4, h_conv5)

472
eval/vae/flolpips/pwcnet.py Normal file
View File

@ -0,0 +1,472 @@
#!/usr/bin/env python
import math
import torch
# try:
from .correlation import correlation # the custom cost volume layer
# except:
# sys.path.insert(0, './correlation'); import correlation # you should consider upgrading python
# end
##########################################################
# assert(int(str('').join(torch.__version__.split('.')[0:2])) >= 13) # requires at least pytorch version 1.3.0
# torch.set_grad_enabled(False) # make sure to not compute gradients for computational performance
# torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance
# ##########################################################
# arguments_strModel = 'default' # 'default', or 'chairs-things'
# arguments_strFirst = './images/first.png'
# arguments_strSecond = './images/second.png'
# arguments_strOut = './out.flo'
# for strOption, strArgument in getopt.getopt(sys.argv[1:], '', [ strParameter[2:] + '=' for strParameter in sys.argv[1::2] ])[0]:
# if strOption == '--model' and strArgument != '': arguments_strModel = strArgument # which model to use
# if strOption == '--first' and strArgument != '': arguments_strFirst = strArgument # path to the first frame
# if strOption == '--second' and strArgument != '': arguments_strSecond = strArgument # path to the second frame
# if strOption == '--out' and strArgument != '': arguments_strOut = strArgument # path to where the output should be stored
# end
##########################################################
def backwarp(tenInput, tenFlow):
    """Backward-warp ``tenInput`` along the optical-flow field ``tenFlow``.

    Args:
        tenInput: [B, C, H, W] feature/image tensor to sample from.
        tenFlow: [B, 2, H, W] flow in pixel units (channel 0 = x, channel 1 = y).

    Returns:
        [B, C, H, W] warped tensor; samples that touch the zero padding
        outside the input are zeroed out via a warped validity mask.
    """
    # Base sampling grid in normalized [-1, 1] coordinates (grid_sample
    # convention with align_corners=False: extremes sit at outer pixel centers).
    tenHor = (
        torch.linspace(-1.0 + (1.0 / tenFlow.shape[3]), 1.0 - (1.0 / tenFlow.shape[3]), tenFlow.shape[3])
        .view(1, 1, 1, -1)
        .expand(-1, -1, tenFlow.shape[2], -1)
    )
    tenVer = (
        torch.linspace(-1.0 + (1.0 / tenFlow.shape[2]), 1.0 - (1.0 / tenFlow.shape[2]), tenFlow.shape[2])
        .view(1, 1, -1, 1)
        .expand(-1, -1, -1, tenFlow.shape[3])
    )
    # NOTE: the original kept per-call local dicts as a shape-keyed "cache",
    # but they were recreated on every invocation, so the grid is now simply
    # built inline.  Using ``.to(tenFlow.device)`` instead of ``.cuda()`` also
    # lets the function run on CPU tensors while behaving identically on CUDA.
    tenGrid = torch.cat([tenHor, tenVer], 1).to(tenFlow.device)
    # Extra all-ones channel: after warping it becomes the validity mask.
    tenPartial = tenFlow.new_ones([tenFlow.shape[0], 1, tenFlow.shape[2], tenFlow.shape[3]])
    # Convert the flow from pixel units to normalized grid units.
    tenFlow = torch.cat(
        [
            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
        ],
        1,
    )
    tenInput = torch.cat([tenInput, tenPartial], 1)
    tenOutput = torch.nn.functional.grid_sample(
        input=tenInput,
        grid=(tenGrid + tenFlow).permute(0, 2, 3, 1),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=False,
    )
    # Binarize the warped ones-channel: anything blended with padding is < 1.
    tenMask = tenOutput[:, -1:, :, :]
    tenMask[tenMask > 0.999] = 1.0
    tenMask[tenMask < 1.0] = 0.0
    return tenOutput[:, :-1, :, :] * tenMask
##########################################################
class Network(torch.nn.Module):
    """PWC-Net optical-flow estimator (pyramid, warping, cost volume).

    forward(tenFirst, tenSecond) takes two [B, 3, H, W] frames and returns a
    [B, 2, H, W] flow field from the first frame to the second.  Pretrained
    weights are downloaded over HTTP at construction time (network I/O).
    """
    def __init__(self):
        super(Network, self).__init__()
        # Six-level convolutional feature pyramid, shared by both input frames.
        class Extractor(torch.nn.Module):
            def __init__(self):
                super(Extractor, self).__init__()
                self.netOne = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netTwo = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netThr = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netFou = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=64, out_channels=96, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=96, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netFiv = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=96, out_channels=128, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netSix = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=128, out_channels=196, kernel_size=3, stride=2, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=196, out_channels=196, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
            # end
            def forward(self, tenInput):
                # Return the pyramid from finest (stride 2) to coarsest (stride 64).
                tenOne = self.netOne(tenInput)
                tenTwo = self.netTwo(tenOne)
                tenThr = self.netThr(tenTwo)
                tenFou = self.netFou(tenThr)
                tenFiv = self.netFiv(tenFou)
                tenSix = self.netSix(tenFiv)
                return [tenOne, tenTwo, tenThr, tenFou, tenFiv, tenSix]
            # end
        # end
        # One Decoder per pyramid level: correlates the first frame's features
        # with the (flow-warped) second frame's, then predicts a flow residual.
        class Decoder(torch.nn.Module):
            def __init__(self, intLevel):
                super(Decoder, self).__init__()
                # Channel count of the previous (coarser) level's concatenated
                # feature block: 81 cost-volume channels + features + flow + upfeat.
                intPrevious = [
                    None,
                    None,
                    81 + 32 + 2 + 2,
                    81 + 64 + 2 + 2,
                    81 + 96 + 2 + 2,
                    81 + 128 + 2 + 2,
                    81,
                    None,
                ][intLevel + 1]
                # Same quantity for the current level (the coarsest level, 6,
                # sees only the 81 cost-volume channels).
                intCurrent = [
                    None,
                    None,
                    81 + 32 + 2 + 2,
                    81 + 64 + 2 + 2,
                    81 + 96 + 2 + 2,
                    81 + 128 + 2 + 2,
                    81,
                    None,
                ][intLevel + 0]
                # All levels except the coarsest upsample the previous estimate.
                if intLevel < 6:
                    self.netUpflow = torch.nn.ConvTranspose2d(
                        in_channels=2, out_channels=2, kernel_size=4, stride=2, padding=1
                    )
                if intLevel < 6:
                    self.netUpfeat = torch.nn.ConvTranspose2d(
                        in_channels=intPrevious + 128 + 128 + 96 + 64 + 32,
                        out_channels=2,
                        kernel_size=4,
                        stride=2,
                        padding=1,
                    )
                if intLevel < 6:
                    # Per-level scale applied to the flow before warping the
                    # second frame's features.
                    self.fltBackwarp = [None, None, None, 5.0, 2.5, 1.25, 0.625, None][intLevel + 1]
                self.netOne = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=intCurrent, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netTwo = torch.nn.Sequential(
                    torch.nn.Conv2d(in_channels=intCurrent + 128, out_channels=128, kernel_size=3, stride=1, padding=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netThr = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128, out_channels=96, kernel_size=3, stride=1, padding=1
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netFou = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96, out_channels=64, kernel_size=3, stride=1, padding=1
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netFiv = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96 + 64,
                        out_channels=32,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                )
                self.netSix = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=intCurrent + 128 + 128 + 96 + 64 + 32,
                        out_channels=2,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    )
                )
            # end
            def forward(self, tenFirst, tenSecond, objPrevious):
                tenFlow = None
                tenFeat = None
                if objPrevious is None:
                    # Coarsest level: plain cost volume, no prior flow estimate.
                    tenFlow = None
                    tenFeat = None
                    tenVolume = torch.nn.functional.leaky_relu(
                        input=correlation.FunctionCorrelation(tenFirst=tenFirst, tenSecond=tenSecond),
                        negative_slope=0.1,
                        inplace=False,
                    )
                    tenFeat = torch.cat([tenVolume], 1)
                elif objPrevious is not None:
                    # Upsample the coarser flow/features, warp the second frame's
                    # features by the scaled flow, and correlate against the first.
                    tenFlow = self.netUpflow(objPrevious["tenFlow"])
                    tenFeat = self.netUpfeat(objPrevious["tenFeat"])
                    tenVolume = torch.nn.functional.leaky_relu(
                        input=correlation.FunctionCorrelation(
                            tenFirst=tenFirst,
                            tenSecond=backwarp(tenInput=tenSecond, tenFlow=tenFlow * self.fltBackwarp),
                        ),
                        negative_slope=0.1,
                        inplace=False,
                    )
                    tenFeat = torch.cat([tenVolume, tenFirst, tenFlow, tenFeat], 1)
                # end
                # DenseNet-style decoder: each stage sees all previous outputs.
                tenFeat = torch.cat([self.netOne(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netTwo(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netThr(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netFou(tenFeat), tenFeat], 1)
                tenFeat = torch.cat([self.netFiv(tenFeat), tenFeat], 1)
                tenFlow = self.netSix(tenFeat)
                return {"tenFlow": tenFlow, "tenFeat": tenFeat}
            # end
        # end
        # Context network: stack of dilated convolutions refining the final flow.
        class Refiner(torch.nn.Module):
            def __init__(self):
                super(Refiner, self).__init__()
                self.netMain = torch.nn.Sequential(
                    torch.nn.Conv2d(
                        in_channels=81 + 32 + 2 + 2 + 128 + 128 + 96 + 64 + 32,
                        out_channels=128,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        dilation=1,
                    ),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2, dilation=2),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=4, dilation=4),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=128, out_channels=96, kernel_size=3, stride=1, padding=8, dilation=8),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=96, out_channels=64, kernel_size=3, stride=1, padding=16, dilation=16),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, dilation=1),
                    torch.nn.LeakyReLU(inplace=False, negative_slope=0.1),
                    torch.nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=1, padding=1, dilation=1),
                )
            # end
            def forward(self, tenInput):
                return self.netMain(tenInput)
            # end
        # end
        self.netExtractor = Extractor()
        self.netTwo = Decoder(2)
        self.netThr = Decoder(3)
        self.netFou = Decoder(4)
        self.netFiv = Decoder(5)
        self.netSix = Decoder(6)
        self.netRefiner = Refiner()
        # Load the official pretrained weights (HTTP download, cached by
        # torch.hub); checkpoint keys use a "module*" prefix, ours use "net*".
        self.load_state_dict(
            {
                strKey.replace("module", "net"): tenWeight
                for strKey, tenWeight in torch.hub.load_state_dict_from_url(
                    url="http://content.sniklaus.com/github/pytorch-pwc/network-" + "default" + ".pytorch"
                ).items()
            }
        )
    # end
    def forward(self, tenFirst, tenSecond):
        """Estimate flow between two frames.

        Inputs are resized to the nearest multiple of 64, decoded coarse-to-fine,
        refined, and the flow is rescaled back to the original resolution.
        """
        intWidth = tenFirst.shape[3]
        intHeight = tenFirst.shape[2]
        # The pyramid downsamples 6 times, so spatial dims must be multiples of 64.
        intPreprocessedWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0))
        intPreprocessedHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0))
        tenPreprocessedFirst = torch.nn.functional.interpolate(
            input=tenFirst, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
        )
        tenPreprocessedSecond = torch.nn.functional.interpolate(
            input=tenSecond, size=(intPreprocessedHeight, intPreprocessedWidth), mode="bilinear", align_corners=False
        )
        tenFirst = self.netExtractor(tenPreprocessedFirst)
        tenSecond = self.netExtractor(tenPreprocessedSecond)
        # Coarse-to-fine decoding: each decoder consumes the previous estimate.
        objEstimate = self.netSix(tenFirst[-1], tenSecond[-1], None)
        objEstimate = self.netFiv(tenFirst[-2], tenSecond[-2], objEstimate)
        objEstimate = self.netFou(tenFirst[-3], tenSecond[-3], objEstimate)
        objEstimate = self.netThr(tenFirst[-4], tenSecond[-4], objEstimate)
        objEstimate = self.netTwo(tenFirst[-5], tenSecond[-5], objEstimate)
        tenFlow = objEstimate["tenFlow"] + self.netRefiner(objEstimate["tenFeat"])
        # The network predicts flow at 1/4 resolution scaled down by 20.
        tenFlow = 20.0 * torch.nn.functional.interpolate(
            input=tenFlow, size=(intHeight, intWidth), mode="bilinear", align_corners=False
        )
        # Undo the pre-resize: rescale flow components to the original resolution.
        tenFlow[:, 0, :, :] *= float(intWidth) / float(intPreprocessedWidth)
        tenFlow[:, 1, :, :] *= float(intHeight) / float(intPreprocessedHeight)
        return tenFlow
    # end
# end
# Lazily-constructed module-level PWC-Net singleton used by estimate().
netNetwork = None
##########################################################
def estimate(tenFirst, tenSecond):
    """Estimate the optical flow from ``tenFirst`` to ``tenSecond``.

    Both inputs are 3xHxW tensors; returns a 2xHxW flow tensor on the CPU.
    The global PWC-Net instance is built lazily on first use (this triggers
    the pretrained-weight download inside Network.__init__).
    """
    global netNetwork
    if netNetwork is None:
        netNetwork = Network().cuda().eval()
    assert tenFirst.shape[1] == tenSecond.shape[1]
    assert tenFirst.shape[2] == tenSecond.shape[2]
    intWidth = tenFirst.shape[2]
    intHeight = tenFirst.shape[1]
    # remember that there is no guarantee for correctness, comment these
    # asserts out if you acknowledge this and want to continue
    assert intWidth == 1024
    assert intHeight == 436
    # The network needs spatial dims that are multiples of 64.
    intScaledWidth = int(math.floor(math.ceil(intWidth / 64.0) * 64.0))
    intScaledHeight = int(math.floor(math.ceil(intHeight / 64.0) * 64.0))
    tenBatchFirst = torch.nn.functional.interpolate(
        input=tenFirst.cuda().view(1, 3, intHeight, intWidth),
        size=(intScaledHeight, intScaledWidth),
        mode="bilinear",
        align_corners=False,
    )
    tenBatchSecond = torch.nn.functional.interpolate(
        input=tenSecond.cuda().view(1, 3, intHeight, intWidth),
        size=(intScaledHeight, intScaledWidth),
        mode="bilinear",
        align_corners=False,
    )
    tenFlow = 20.0 * torch.nn.functional.interpolate(
        input=netNetwork(tenBatchFirst, tenBatchSecond),
        size=(intHeight, intWidth),
        mode="bilinear",
        align_corners=False,
    )
    # Undo the resize: rescale the flow components to the original resolution.
    tenFlow[:, 0, :, :] *= float(intWidth) / float(intScaledWidth)
    tenFlow[:, 1, :, :] *= float(intHeight) / float(intScaledHeight)
    return tenFlow[0, :, :, :].cpu()
##########################################################
# if __name__ == '__main__':
# tenFirst = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strFirst))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
# tenSecond = torch.FloatTensor(numpy.ascontiguousarray(numpy.array(PIL.Image.open(arguments_strSecond))[:, :, ::-1].transpose(2, 0, 1).astype(numpy.float32) * (1.0 / 255.0)))
# tenOutput = estimate(tenFirst, tenSecond)
# objOutput = open(arguments_strOut, 'wb')
# numpy.array([ 80, 73, 69, 72 ], numpy.uint8).tofile(objOutput)
# numpy.array([ tenOutput.shape[2], tenOutput.shape[1] ], numpy.int32).tofile(objOutput)
# numpy.array(tenOutput.numpy().transpose(1, 2, 0), numpy.float32).tofile(objOutput)
# objOutput.close()
# end

107
eval/vae/flolpips/utils.py Normal file
View File

@ -0,0 +1,107 @@
import cv2
import numpy as np
import torch
def normalize_tensor(in_feat, eps=1e-10):
    """Scale each feature vector to unit L2 norm along dim 1 (eps avoids /0)."""
    magnitude = torch.sum(in_feat**2, dim=1, keepdim=True).sqrt()
    return in_feat / (magnitude + eps)
def l2(p0, p1, range=255.0):
    """Half the mean squared error between two images scaled to [0, 1]."""
    diff = p0 / range - p1 / range
    return 0.5 * np.mean(diff**2)
def dssim(p0, p1, range=255.0):
    """Structural dissimilarity: (1 - SSIM) / 2 for two HxWxC images.

    ``skimage.measure.compare_ssim`` was removed in scikit-image 0.18, so
    prefer the modern ``skimage.metrics.structural_similarity`` API and fall
    back for old installations.  ``multichannel`` was likewise replaced by
    ``channel_axis`` in scikit-image 0.19.
    """
    try:
        from skimage.metrics import structural_similarity

        try:
            ssim = structural_similarity(p0, p1, data_range=range, channel_axis=-1)
        except TypeError:
            # scikit-image < 0.19 only knows the ``multichannel`` keyword.
            ssim = structural_similarity(p0, p1, data_range=range, multichannel=True)
    except ImportError:
        # Legacy scikit-image < 0.16.
        from skimage.measure import compare_ssim

        ssim = compare_ssim(p0, p1, data_range=range, multichannel=True)
    return (1 - ssim) / 2.0
def tensor2im(image_tensor, imtype=np.uint8, cent=1.0, factor=255.0 / 2.0):
    """Convert the first CHW image of a tensor batch to an HWC numpy image.

    Values are shifted by ``cent`` and scaled by ``factor`` (defaults map
    [-1, 1] to [0, 255]) before casting to ``imtype``.
    """
    arr = image_tensor[0].cpu().float().numpy()
    arr = (arr.transpose((1, 2, 0)) + cent) * factor
    return arr.astype(imtype)
def tensor2np(tensor_obj):
    """Return the first CHW image of a tensor batch as an HWC numpy array."""
    return np.transpose(tensor_obj[0].cpu().float().numpy(), (1, 2, 0))
def np2tensor(np_obj):
    """Convert an HWC numpy image into a 1xCxHxW float tensor."""
    expanded = np_obj[:, :, :, np.newaxis]
    return torch.Tensor(expanded.transpose((3, 2, 0, 1)))
def tensor2tensorlab(image_tensor, to_norm=True, mc_only=False):
    """Convert an RGB image tensor (1xCxHxW, [-1, 1]) to a Lab tensor.

    With ``mc_only`` only the L channel is mean-centered (shift by -50); with
    ``to_norm`` (and not ``mc_only``) the L channel is mean-centered and the
    whole Lab image divided by 100.
    """
    from skimage import color

    lab = color.rgb2lab(tensor2im(image_tensor))
    if mc_only:
        lab[:, :, 0] = lab[:, :, 0] - 50
    if to_norm and not mc_only:
        lab[:, :, 0] = lab[:, :, 0] - 50
        lab = lab / 100.0
    return np2tensor(lab)
def read_frame_yuv2rgb(stream, width, height, iFrame, bit_depth, pix_fmt="420"):
    """Read frame ``iFrame`` from a raw planar YUV byte stream as RGB.

    Args:
        stream: binary file object opened on the raw YUV data.
        width, height: luma plane dimensions in pixels.
        iFrame: zero-based frame index to seek to.
        bit_depth: 8 reads uint8 samples; any other value reads uint16.
        pix_fmt: "420" (2x-subsampled chroma) or "444" (full-res chroma).

    Returns:
        HxWx3 uint8 RGB array, or None when ``pix_fmt`` is unsupported.
    """
    if pix_fmt == "420":
        multiplier = 1
        uv_factor = 2
    elif pix_fmt == "444":
        multiplier = 2
        uv_factor = 1
    else:
        print("Pixel format {} is not supported".format(pix_fmt))
        return
    if bit_depth == 8:
        datatype = np.uint8
        # One 8-bit frame occupies width*height*multiplier*3/2 bytes.  Keep the
        # offset integral: file.seek() rejects floats, which the previous
        # ``iFrame * 1.5 * ...`` expression produced for every iFrame > 0.
        stream.seek(iFrame * width * height * multiplier * 3 // 2)
        Y = np.fromfile(stream, dtype=datatype, count=width * height).reshape((height, width))
        # read chroma samples and upsample since original is 4:2:0 sampling
        U = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
            (height // uv_factor, width // uv_factor)
        )
        V = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
            (height // uv_factor, width // uv_factor)
        )
    else:
        datatype = np.uint16
        # 16-bit samples double the byte count: 3*width*height*multiplier bytes per frame.
        stream.seek(iFrame * 3 * width * height * multiplier)
        Y = np.fromfile(stream, dtype=datatype, count=width * height).reshape((height, width))
        U = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
            (height // uv_factor, width // uv_factor)
        )
        V = np.fromfile(stream, dtype=datatype, count=(width // uv_factor) * (height // uv_factor)).reshape(
            (height // uv_factor, width // uv_factor)
        )
    if pix_fmt == "420":
        # Repack the planes into the I420 memory layout cv2 expects.
        yuv = np.empty((height * 3 // 2, width), dtype=datatype)
        yuv[0:height, :] = Y
        yuv[height : height + height // 4, :] = U.reshape(-1, width)
        yuv[height + height // 4 :, :] = V.reshape(-1, width)
        if bit_depth != 8:
            # cv2's YUV conversion wants 8-bit input; rescale down.
            yuv = (yuv / (2**bit_depth - 1) * 255).astype(np.uint8)
        # convert to rgb
        rgb = cv2.cvtColor(yuv, cv2.COLOR_YUV2RGB_I420)
    else:
        yvu = np.stack([Y, V, U], axis=2)
        if bit_depth != 8:
            yvu = (yvu / (2**bit_depth - 1) * 255).astype(np.uint8)
        rgb = cv2.cvtColor(yvu, cv2.COLOR_YCrCb2RGB)
    return rgb

55
eval/vae/launch.sh Normal file
View File

@ -0,0 +1,55 @@
#!/bin/bash
# Evaluate an Open-Sora VAE checkpoint: reconstruct validation images/videos
# at several resolutions and compute SSIM/PSNR/LPIPS/FloLPIPS metrics.
# Usage: launch.sh [CKPT_PATH]; IMG_PATH/VID_PATH/NUM_FRAMES/FORCE_HUGGINGFACE
# may be overridden via the environment.
CKPT_PATH=$1
# Quote all [ -z ] tests: unquoted expansions break when a value contains spaces.
if [ -z "$IMG_PATH" ]; then
    IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
fi
if [ -z "$VID_PATH" ]; then
    VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
fi
if [ -z "$NUM_FRAMES" ]; then
    NUM_FRAMES=17
fi
if [ -z "$FORCE_HUGGINGFACE" ]; then
    FORCE_HUGGINGFACE=False
fi
if [ -z "$CKPT_PATH" ]; then # huggingface model
    STEP_RECORD=epoch0-global_step0
    LOG_DIR=outputs/OpenSoraVAE_V1_3/eval
    FORCE_HUGGINGFACE=True
    CKPT_PATH=pretrained_models/OpenSoraVAE_V1_3
else
    if [[ -d $CKPT_PATH ]] ; then
        # checkpoint directory: use its own name as the step record
        STEP_RECORD=$(basename "$CKPT_PATH")
    elif [[ -f $CKPT_PATH ]]; then
        # checkpoint file: use the parent directory's name
        STEP_RECORD=$(basename "$(dirname "$CKPT_PATH")")
    else
        echo "$CKPT_PATH is not valid";
        exit 1
    fi
    LOG_DIR=$(dirname "$CKPT_PATH")/eval
fi
echo "saving losses and metrics to $LOG_DIR"
echo "video path: ${VID_PATH}"
mkdir -p "$LOG_DIR"
# generate video, 256x256
torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/video_16z_256x256.py --data-path $VID_PATH --save-dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x256x256 --ckpt-path ${CKPT_PATH} --num-frames $NUM_FRAMES --force-huggingface ${FORCE_HUGGINGFACE}
# calc metrics, 17x256x256
python eval/vae/eval_common_metric.py --batch_size 4 --real_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x256x256_ori --generated_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x256x256_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames $NUM_FRAMES --sample_rate 1 --metric ssim psnr lpips flolpips --type video --res_dir ${LOG_DIR}
# # generate video, 512x512
torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/video_16z_512x512.py --data-path $VID_PATH --save-dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x512x512 --ckpt-path ${CKPT_PATH} --num-frames $NUM_FRAMES --force-huggingface ${FORCE_HUGGINGFACE}
# # calc metrics, 17x512x512
python eval/vae/eval_common_metric.py --batch_size 4 --real_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x512x512_ori --generated_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/${NUM_FRAMES}x512x512_rec --device cuda --sample_fps 24 --crop_size 512 --resolution 512 --num_frames $NUM_FRAMES --sample_rate 1 --metric ssim psnr lpips flolpips --type video --res_dir ${LOG_DIR}
# # generate image, 1024x1024
torchrun --standalone --nproc_per_node=1 scripts/inference_opensoravae_v1_3.py configs/vae_v1_3/inference/image_16z.py --data-path $IMG_PATH --save-dir samples/opensoravae_v1_3/${STEP_RECORD}/1x1024x1024 --ckpt-path ${CKPT_PATH} --num-frames 1 --force-huggingface ${FORCE_HUGGINGFACE}
# # calc metrics, 1x1024x1024
python eval/vae/eval_common_metric.py --batch_size 4 --real_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/1x1024x1024_ori --generated_video_dir samples/opensoravae_v1_3/${STEP_RECORD}/1x1024x1024_rec --device cuda --sample_fps 1 --crop_size 1024 --resolution 1024 --num_frames 1 --sample_rate 1 --metric ssim psnr lpips --type image --res_dir ${LOG_DIR}

12
eval/vae/script/eval.sh Normal file
View File

@ -0,0 +1,12 @@
# Compute reconstruction metrics (SSIM / PSNR / LPIPS / FloLPIPS) on a
# 100-video subset; directories are relative to the eval working directory.
python eval/eval_common_metric.py \
    --batch_size 2 \
    --real_video_dir ../test_eval/release/origin \
    --generated_video_dir ../test_eval/release \
    --device cuda \
    --sample_fps 10 \
    --crop_size 256 \
    --resolution 256 \
    --num_frames 17 \
    --sample_rate 1 \
    --subset_size 100 \
    --metric ssim psnr lpips flolpips

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,75 @@
import argparse
import os
import time
import torch
from vbench import VBench
# Path to VBench's prompt/metadata manifest (relative to the repo root).
full_info_path = "eval/vbench/VBench_full_info.json"
# VBench evaluation dimensions, grouped a-h so the launcher can shard them
# across GPUs with roughly balanced runtime; per-dimension timings inline.
dimensions = [
    # a: 10min
    "subject_consistency", # 4min
    "imaging_quality", # 6min
    # b: 12min
    "background_consistency", # 2min
    "motion_smoothness", # 5min
    "overall_consistency", # 2min
    "human_action", # 3min
    # c: 14min
    "multiple_objects", # 14min
    # d: 14min
    "spatial_relationship", # 14min
    # e: 12min
    "object_class", # 12min
    # f: 12min
    "color", # 12min
    # g: 10.5min
    "aesthetic_quality", # 2.5min
    "appearance_style", # 6min
    "temporal_flickering", # 2min
    # h: 9min
    "scene", # 3min
    "temporal_style", # 2min
    "dynamic_degree", # 4min
]
def parse_args():
    """Parse CLI args: positional video folder and checkpoint dir, plus an
    optional [start, end) slice over the ``dimensions`` list."""
    parser = argparse.ArgumentParser()
    parser.add_argument("video_folder", type=str) # samples/samples..._vbench/eval
    parser.add_argument("model_ckpt", type=str)
    parser.add_argument("--start", type=int, default=0) # start index of dimension to be evaluated
    parser.add_argument("--end", type=int, default=-1) # end index of dimension slice; -1 means through the last
    args = parser.parse_args()
    return args
if __name__ == "__main__":
    args = parse_args()
    # Results are written next to the checkpoint: <model_ckpt>/vbench/
    output_dir = os.path.join(args.model_ckpt, "vbench")
    os.makedirs(output_dir, exist_ok=True)
    video_path = args.video_folder
    kwargs = {}
    kwargs["imaging_quality_preprocessing_mode"] = "longer"  # use VBench/evaluate.py default
    start_time = time.time()
    # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module
    my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir)
    if args.end == -1:  # adjust end accordingly
        args.end = len(dimensions)
    # Evaluate each selected dimension separately so shards can run in parallel.
    for dim in dimensions[args.start : args.end]:
        my_VBench.evaluate(
            videos_path=video_path,
            name=dim,
            local=False,
            read_frame=False,
            dimension_list=[dim],
            mode="vbench_standard",
            **kwargs,
        )
    print("Runtime: %s seconds " % (time.time() - start_time))

61
eval/vbench/launch.sh Normal file
View File

@ -0,0 +1,61 @@
#!/bin/bash
# Shard VBench sample generation across 8 GPUs.  The optional trailing
# arguments (resolution, aspect ratio, sampling steps, flow, llm-refine)
# are forwarded to eval/sample.sh only when they were supplied.
# NOTE: the original first line was "# !/bin/bash" — a comment, not a shebang.
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname $CKPT)
    CKPT_BASE=$(basename $parentdir)_ema
else
    CKPT_BASE=$(basename $CKPT)
fi
# LOG_BASE=$(dirname $CKPT)/eval
LOG_BASE=./sample/eval
mkdir -p ${LOG_BASE}  # the log redirections below fail if the directory is missing
echo "Logging to $LOG_BASE"
GPUS=(0 1 2 3 4 5 6 7)
TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
START_INDEX_LIST=(0 120 240 360 480 600 720 840)
END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
## Modify the following to run on multiple machines for faster results
## 720p will take quite long on a single machine
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
# LOG_BASE=$(dirname $CKPT)/eval/last_60
# mkdir -p ${LOG_BASE}
# echo "Logging to $LOG_BASE"
for i in "${!GPUS[@]}"; do
    # Build the sample.sh argument list incrementally: each optional argument
    # is appended only when it — and every argument before it — was supplied.
    # This replaces the previous four-deep nest of duplicated command lines.
    ARGS=($CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]})
    if [ -n "${RES}" ] && [ -n "${ASP_RATIO}" ]; then
        ARGS+=(${RES} ${ASP_RATIO})
        if [ -n "${NUM_SAMPLING_STEPS}" ]; then
            ARGS+=(${NUM_SAMPLING_STEPS})
            if [ -n "${FLOW}" ]; then
                ARGS+=(${FLOW})
                if [ -n "${LLM_REFINE}" ]; then
                    ARGS+=(${LLM_REFINE})
                fi
            fi
        fi
    fi
    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh "${ARGS[@]}" > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

View File

@ -0,0 +1,16 @@
#!/bin/bash
# Shard VBench metric computation for pre-generated samples across 8 GPUs.
# NOTE: the original first line was "# !/bin/bash" — a comment, not a shebang.
VIDEO_DIR=$1
CKPT_DIR=$2
LOG_BASE=$CKPT_DIR
mkdir -p "$LOG_BASE"
echo "Logging to $LOG_BASE"
GPUS=(0 1 2 3 4 5 6 7)
# Dimension index ranges per GPU (indices into the `dimensions` list in
# eval/vbench/calc_vbench.py); ranges are uneven to balance runtimes.
START_INDEX_LIST=(0 2 6 7 8 9 10 13)
END_INDEX_LIST=(2 6 7 8 9 10 13 16)
TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

View File

@ -0,0 +1,155 @@
import argparse
import json
import os
# Weights for combining the two aggregate scores into the VBench total score.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
# Dimensions contributing to the "quality" aggregate.
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]
# Dimensions contributing to the "semantic" aggregate.
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
# Per-dimension min/max used to rescale raw scores to [0, 1] before weighting
# (presumably VBench's published normalization constants — verify upstream).
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}
# Per-dimension weights within each aggregate ("dynamic degree" counts half).
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}
# Output order for the tabulated, scaled results.
ordered_scaled_res = [
    "total score",
    "quality score",
    "semantic score",
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
def parse_args():
    """Build and parse the CLI arguments for the VBench score-tabulation script."""
    parser = argparse.ArgumentParser()
    # Directory holding the per-dimension *_eval_results.json / *_full_info.json
    # files (typically <ckpt_dir>/eval/vbench).
    parser.add_argument("--score_dir", type=str)
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    res_postfix = "_eval_results.json"
    info_postfix = "_full_info.json"
    files = os.listdir(args.score_dir)
    res_files = [x for x in files if res_postfix in x]
    info_files = [x for x in files if info_postfix in x]
    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"

    # Gather raw per-dimension scores from every *_eval_results.json file.
    full_results = {}
    for res_file in res_files:
        # first check if results is normal: the companion *_full_info.json must
        # reference at least one evaluated video.
        info_file = res_file.split(res_postfix)[0] + info_postfix
        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
            info = json.load(f)
        assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
        # read results (val is [score, per-video details]; only the score is kept)
        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
            data = json.load(f)
        for key, val in data.items():
            full_results[key] = format(val[0], ".4f")

    # Min-max normalize each dimension to [0, 1] and apply its weight.
    scaled_results = {}
    dims = set()
    for key, val in full_results.items():
        dim = key.replace("_", " ") if "_" in key else key
        scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (
            NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"]
        )
        scaled_score *= DIM_WEIGHT[dim]
        scaled_results[dim] = scaled_score
        dims.add(dim)
    assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"

    # Weighted averages per group, then the blended total.
    quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
    semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST])
    scaled_results["quality score"] = quality_score
    scaled_results["semantic score"] = semantic_score
    scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )

    # Render scaled scores as "NN.NN%" strings in a fixed display order.
    formated_scaled_results = {"items": []}
    for key in ordered_scaled_res:
        formated_score = format(scaled_results[key] * 100, ".2f") + "%"
        formated_scaled_results["items"].append({key: formated_score})

    # BUGFIX: write with an explicit utf-8 encoding, matching the reads above
    # (previously the writes used the platform default encoding).
    output_file_path = os.path.join(args.score_dir, "all_results.json")
    with open(output_file_path, "w", encoding="utf-8") as outfile:
        json.dump(full_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")

    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
    with open(scaled_file_path, "w", encoding="utf-8") as outfile:
        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {scaled_file_path}")

View File

@ -0,0 +1,71 @@
import argparse
import os
import time
import torch
from vbench import VBench
from vbench2_beta_i2v import VBenchI2V
# Prompt/metadata file shipped with the VBench-I2V benchmark.
full_info_path = "eval/vbench_i2v/vbench2_i2v_full_info.json"

# Standard VBench video-quality dimensions evaluated via the base VBench class.
video_quality_dimensions = [
    "subject_consistency",
    "background_consistency",
    "motion_smoothness",
    "dynamic_degree",
    "aesthetic_quality",
    "imaging_quality",
    "temporal_flickering",
]
# I2V-specific dimensions evaluated via VBenchI2V.
i2v_dimensions = ["i2v_subject", "i2v_background", "camera_motion"]
def str2bool(v):
    """Parse a boolean-like CLI value ("yes"/"no", "t"/"f", "1"/"0", ...).

    Passes real bools through unchanged; raises ArgumentTypeError otherwise.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
def parse_args():
    """CLI arguments for the VBench-I2V evaluation driver."""
    parser = argparse.ArgumentParser()
    parser.add_argument("video_folder", type=str)  # samples/samples..._vbench_i2v/
    parser.add_argument("model_ckpt", type=str)  # checkpoint dir; scores land under <model_ckpt>/vbench_i2v
    parser.add_argument("--start", type=int, default=0)  # start index of dimension to be evaluated
    parser.add_argument("--end", type=int, default=-1)  # end index of dimension to be evaluated (-1 = all)
    parser.add_argument("--calc_i2v", type=str2bool, default=True)  # evaluate the I2V-specific dimensions
    parser.add_argument("--calc_quality", type=str2bool, default=True)  # evaluate the generic quality dimensions
    args = parser.parse_args()
    return args
if __name__ == "__main__":
    args = parse_args()
    # Scores are written next to the checkpoint, under <model_ckpt>/vbench_i2v.
    output_dir = os.path.join(args.model_ckpt, "vbench_i2v")
    os.makedirs(output_dir, exist_ok=True)
    video_path = args.video_folder
    start_time = time.time()

    # I2V-specific dimensions (i2v_subject / i2v_background / camera_motion).
    if args.calc_i2v:
        my_VBench_I2V = VBenchI2V(torch.device("cuda"), full_info_path, output_dir)
        # --end == -1 means "through the last dimension"
        end = args.end if args.end != -1 else len(i2v_dimensions)
        for i2v_dim in i2v_dimensions[args.start : end]:
            my_VBench_I2V.evaluate(videos_path=video_path, name=i2v_dim, dimension_list=[i2v_dim], resolution="1-1")

    kwargs = {}
    kwargs["imaging_quality_preprocessing_mode"] = "longer"  # use VBench/evaluate.py default

    # Generic video-quality dimensions via the base VBench evaluator.
    # NOTE: --start/--end index this separate dimension list too; the launch
    # script assigns calc_i2v XOR calc_quality per GPU so the ranges don't clash.
    if args.calc_quality:
        my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir)
        end = args.end if args.end != -1 else len(video_quality_dimensions)
        for quality_dim in video_quality_dimensions[args.start : end]:
            my_VBench.evaluate(
                videos_path=video_path, name=quality_dim, dimension_list=[quality_dim], mode="vbench_standard", **kwargs
            )
    print("Runtime: %s seconds " % (time.time() - start_time))

View File

@ -0,0 +1,17 @@
import json
import os
# Build the all_i2v.txt prompt file: one line per benchmark entry, consisting of
# the English prompt followed by an inline JSON payload pointing at the cropped
# reference image for the chosen aspect ratio.
RESOLUTIONS = ["1-1", "16-9", "7-4", "8-5"]
cache_root = "/mnt/jfs-hdd/sora/data/vbench-i2v/crop"  # NOTE(review): cluster-specific path — adjust per deployment
resolution = RESOLUTIONS[0]
json_file = "vbench2_i2v_full_info.json"
save_path = "all_i2v.txt"

# BUGFIX: json.load(open(...)) leaked the file handle; close it deterministically.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

txt = [
    f'{x["prompt_en"]}{{"reference_path": "{os.path.join(cache_root, resolution, x["image_name"])}", "mask_strategy": "0"}}'
    for x in data
]
with open(save_path, "w", encoding="utf-8") as f:
    f.write("\n".join(txt))

50
eval/vbench_i2v/launch.sh Normal file
View File

@ -0,0 +1,50 @@
#!/bin/bash
# Launch VBench-I2V sampling across 8 GPUs, splitting the prompt list by index
# range. Each GPU runs eval/sample.sh (task id -5) over its own slice.
# Usage: launch.sh CKPT NUM_FRAMES MODEL_NAME [RES] [ASP_RATIO] [NUM_SAMPLING_STEPS] [FLOW] [LLM_REFINE]
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8

# For EMA checkpoints, name logs after the parent dir with an _ema suffix.
if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname $CKPT)
    CKPT_BASE=$(basename $parentdir)_ema
else
    CKPT_BASE=$(basename $CKPT)
fi
LOG_BASE=$(dirname $CKPT)/eval
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
# Prompt-index slice per GPU; the last slice's end (2000) over-covers the list.
START_INDEX_LIST=(0 140 280 420 560 700 840 980)
END_INDEX_LIST=(140 280 420 560 700 840 980 2000)

# sample.sh takes positional args, so each optional argument can only be
# forwarded when all earlier ones are present — hence the nested ladder below.
for i in "${!GPUS[@]}"; do
    if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
    then
        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
    else
        if [ -z ${NUM_SAMPLING_STEPS} ];
        then
            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
        else
            if [ -z ${FLOW} ];
            then
                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
            else
                if [ -z ${LLM_REFINE} ];
                then
                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                else
                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                fi
            fi
        fi
    fi
done

View File

@ -0,0 +1,19 @@
#!/bin/bash
# BUGFIX: the shebang was written as "# !/bin/bash" (with a space), which is an
# ordinary comment, so the script ran under whatever shell invoked it.
#
# Run VBench-I2V scoring for a folder of sampled videos across 8 GPUs. The
# first two GPUs split the I2V dimensions; the remaining six split the quality
# dimensions (see calc_vbench_i2v.py for the dimension lists).
# Usage: launch_calc_vbench_i2v.sh VIDEO_DIR CKPT_DIR
VIDEO_DIR=$1
CKPT_DIR=$2

LOG_BASE=$CKPT_DIR
mkdir -p $LOG_BASE
echo "Logging to $LOG_BASE"

GPUS=(0 1 2 3 4 5 6 7)
CALC_I2V_LIST=(True True False False False False False False)
CALC_QUALITY_LIST=(False False True True True True True True)
# --start/--end index into the i2v dimension list for the first two GPUs and
# into the quality dimension list for the rest (-1 = through the end).
START_INDEX_LIST=(0 2 0 2 3 4 5 6)
END_INDEX_LIST=(2 -1 2 3 4 5 6 -1)
TASK_ID_LIST=(calc_vbench_i2v_a calc_vbench_i2v_b calc_vbench_i2v_c calc_vbench_i2v_d calc_vbench_i2v_e calc_vbench_i2v_f calc_vbench_i2v_g calc_vbench_i2v_h) # for log records only

for i in "${!GPUS[@]}"; do
    CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench_i2v/calc_vbench_i2v.py $VIDEO_DIR $CKPT_DIR --calc_i2v ${CALC_I2V_LIST[i]} --calc_quality ${CALC_QUALITY_LIST[i]} --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

View File

@ -0,0 +1,132 @@
import argparse
import json
import os
# Weights used to blend the I2V and quality aggregates into the total score.
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0

# Dimensions aggregated into the "i2v score".
I2V_LIST = [
    "i2v_subject",
    "i2v_background",
]
# Dimensions aggregated into the "quality score".
I2V_QUALITY_LIST = [
    "subject_consistency",
    "background_consistency",
    "temporal_flickering",
    "motion_smoothness",
    "aesthetic_quality",
    "imaging_quality",
    "dynamic_degree",
]
# Per-dimension weights (camera_motion and dynamic_degree are down-weighted).
DIM_WEIGHT_I2V = {
    "camera_motion": 0.1,
    "i2v_subject": 1,
    "i2v_background": 1,
    "subject_consistency": 1,
    "background_consistency": 1,
    "motion_smoothness": 1,
    "dynamic_degree": 0.5,
    "aesthetic_quality": 1,
    "imaging_quality": 1,
    "temporal_flickering": 1,
}
# Min/Max bounds used to rescale raw scores into [0, 1] (presumably the
# VBench-I2V leaderboard normalization — verify against upstream).
NORMALIZE_DIC_I2V = {
    "camera_motion": {"Min": 0.0, "Max": 1.0},
    "i2v_subject": {"Min": 0.1462, "Max": 1.0},
    "i2v_background": {"Min": 0.2615, "Max": 1.0},
    "subject_consistency": {"Min": 0.1462, "Max": 1.0},
    "background_consistency": {"Min": 0.2615, "Max": 1.0},
    "motion_smoothness": {"Min": 0.7060, "Max": 0.9975},
    "dynamic_degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic_quality": {"Min": 0.0, "Max": 1.0},
    "imaging_quality": {"Min": 0.0, "Max": 1.0},
    "temporal_flickering": {"Min": 0.6293, "Max": 1.0},
}
# Output ordering for the scaled-results JSON (aggregates first).
ordered_scaled_res = [
    "total score",
    "i2v score",
    "quality score",
    "camera_motion",
    "i2v_subject",
    "i2v_background",
    "subject_consistency",
    "background_consistency",
    "motion_smoothness",
    "dynamic_degree",
    "aesthetic_quality",
    "imaging_quality",
    "temporal_flickering",
]
def parse_args():
    """Build and parse the CLI arguments for the VBench-I2V score tabulator."""
    parser = argparse.ArgumentParser()
    # Directory holding the per-dimension result files
    # (typically <ckpt_dir>/eval/vbench_i2v).
    parser.add_argument("--score_dir", type=str)
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    res_postfix = "_eval_results.json"
    info_postfix = "_full_info.json"
    files = os.listdir(args.score_dir)
    res_files = [x for x in files if res_postfix in x]
    info_files = [x for x in files if info_postfix in x]
    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"

    # Gather raw per-dimension scores from every *_eval_results.json file.
    full_results = {}
    for res_file in res_files:
        # first check if results is normal: the companion *_full_info.json must
        # reference at least one evaluated video.
        info_file = res_file.split(res_postfix)[0] + info_postfix
        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
            info = json.load(f)
        assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
        # read results (val is [score, per-video details]; only the score is kept)
        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
            data = json.load(f)
        for key, val in data.items():
            full_results[key] = format(val[0], ".4f")

    # Min-max normalize each dimension to [0, 1] and apply its weight.
    scaled_results = {}
    dims = set()
    for key, val in full_results.items():
        dim = key
        scaled_score = (float(val) - NORMALIZE_DIC_I2V[dim]["Min"]) / (
            NORMALIZE_DIC_I2V[dim]["Max"] - NORMALIZE_DIC_I2V[dim]["Min"]
        )
        scaled_score *= DIM_WEIGHT_I2V[dim]
        scaled_results[dim] = scaled_score
        dims.add(dim)
    assert len(dims) == len(NORMALIZE_DIC_I2V), f"{set(NORMALIZE_DIC_I2V.keys())-dims} not calculated yet"

    # Weighted averages per group, then the blended total.
    quality_score = sum([scaled_results[i] for i in I2V_QUALITY_LIST]) / sum(
        [DIM_WEIGHT_I2V[i] for i in I2V_QUALITY_LIST]
    )
    i2v_score = sum([scaled_results[i] for i in I2V_LIST]) / sum([DIM_WEIGHT_I2V[i] for i in I2V_LIST])
    scaled_results["quality score"] = quality_score
    scaled_results["i2v score"] = i2v_score
    scaled_results["total score"] = (quality_score * I2V_QUALITY_WEIGHT + i2v_score * I2V_WEIGHT) / (
        I2V_QUALITY_WEIGHT + I2V_WEIGHT
    )

    # Render scaled scores as "NN.NN%" strings in a fixed display order.
    # NOTE(review): the t2v tabulator uses the key "items" while this script
    # uses "item" — confirm which one downstream consumers expect.
    formated_scaled_results = {"item": []}
    for key in ordered_scaled_res:
        formated_res = format(scaled_results[key] * 100, ".2f") + "%"
        formated_scaled_results["item"].append({key: formated_res})

    # BUGFIX: write with an explicit utf-8 encoding, matching the reads above
    # (previously the writes used the platform default encoding).
    output_file_path = os.path.join(args.score_dir, "all_results.json")
    with open(output_file_path, "w", encoding="utf-8") as outfile:
        json.dump(full_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")

    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
    with open(scaled_file_path, "w", encoding="utf-8") as outfile:
        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {scaled_file_path}")

File diff suppressed because it is too large Load Diff

758
gradio/app.py Normal file
View File

@ -0,0 +1,758 @@
#!/usr/bin/env python
"""
This script runs a Gradio App for the Open-Sora model.
Usage:
python demo.py <config-path>
"""
import argparse
import datetime
import importlib
import os
import subprocess
import sys
from tempfile import NamedTemporaryFile
import spaces
import torch
import gradio as gr
# Model versions selectable via --model-type.
MODEL_TYPES = ["v1.3"]
# Watermark overlaid on generated videos (skipped if the file is missing).
WATERMARK_PATH = "./assets/images/watermark/watermark.png"
# Inference config per model variant; the i2v variant uses a separate config.
CONFIG_MAP = {
    "v1.3": "configs/opensora-v1-3/inference/t2v.py",
    "v1.3_i2v": "configs/opensora-v1-3/inference/v2v.py",
}
# HuggingFace repos for the STDiT weights, keyed by mode (and resolution for t2v).
HF_STDIT_MAP = {
    "t2v": {
        "360p": "hpcaitech/OpenSora-STDiT-v4-360p",
        "720p": "hpcaitech/OpenSora-STDiT-v4",
    },
    "i2v": "hpcaitech/OpenSora-STDiT-v4-i2v",
}
# ============================
# Prepare Runtime Environment
# ============================
def install_dependencies(enable_optimization=False):
    """
    Install the required dependencies for the demo if they are not already installed.

    When *enable_optimization* is set, installs flash-attn, apex, ninja and
    xformers (each only if not importable). No-op otherwise.
    """

    def _is_package_available(name) -> bool:
        # Importability check; treats a missing module as "not installed".
        try:
            importlib.import_module(name)
            return True
        except (ImportError, ModuleNotFoundError):
            return False

    if enable_optimization:
        # install flash attention
        if not _is_package_available("flash_attn"):
            subprocess.run(
                f"{sys.executable} -m pip install flash-attn --no-build-isolation",
                # BUGFIX: passing a bare dict as `env` replaces the ENTIRE child
                # environment (dropping PATH, CUDA_HOME, etc.), which breaks the
                # build; extend the current environment instead.
                env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
                shell=True,
            )

        # install apex for fused layernorm
        if not _is_package_available("apex"):
            subprocess.run(
                f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git',
                shell=True,
            )

        # install ninja
        if not _is_package_available("ninja"):
            subprocess.run(f"{sys.executable} -m pip install ninja", shell=True)

        # install xformers
        if not _is_package_available("xformers"):
            subprocess.run(
                f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
                shell=True,
            )
# ============================
# Model-related
# ============================
def read_config(config_path):
    """Load and return an mmengine Config parsed from *config_path*."""
    # Imported lazily so the module can be imported before deps are installed.
    from mmengine.config import Config

    cfg = Config.fromfile(config_path)
    return cfg
def build_models(mode, resolution, enable_optimization=False):
    """
    Build the models for the given mode, resolution, and configuration.

    Returns (vae, text_encoder, stdit, scheduler, config). VAE and STDiT are
    moved to CUDA in bf16 eval mode; the T5 text encoder stays in fp32.
    """
    # build vae
    from opensora.registry import MODELS, build_module

    # i2v uses its own inference config; everything else falls back to t2v.
    if mode == "i2v":
        config = read_config(CONFIG_MAP["v1.3_i2v"])
    else:
        config = read_config(CONFIG_MAP["v1.3"])
    vae = build_module(config.vae, MODELS).cuda()

    # build text encoder
    text_encoder = build_module(config.text_encoder, MODELS)  # T5 must be fp32
    text_encoder.t5.model = text_encoder.t5.model.cuda()

    # Determine model weights based on mode and resolution
    if mode == "i2v":
        weight_path = HF_STDIT_MAP["i2v"]
    else:  # t2v
        weight_path = HF_STDIT_MAP["t2v"].get(resolution, None)
    if not weight_path:
        raise ValueError(f"Unsupported resolution {resolution} for mode {mode}")

    # build stdit
    from opensora.models.stdit.stdit3 import STDiT3

    # Drop registry-only keys before forwarding the config to from_pretrained.
    model_kwargs = {k: v for k, v in config.model.items() if k not in ("type", "from_pretrained", "force_huggingface")}
    print("Load STDIT3 from ", weight_path)
    stdit = STDiT3.from_pretrained(weight_path, **model_kwargs).cuda()

    # build scheduler
    from opensora.registry import SCHEDULERS

    scheduler = build_module(config.scheduler, SCHEDULERS)

    # hack for classifier-free guidance
    text_encoder.y_embedder = stdit.y_embedder

    # move models to device
    vae = vae.to(torch.bfloat16).eval()
    text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must be in fp32
    stdit = stdit.to(torch.bfloat16).eval()

    # clear cuda
    torch.cuda.empty_cache()
    return vae, text_encoder, stdit, scheduler, config
def parse_args():
    """Build and parse the CLI arguments for the Gradio demo app."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-type",
        default="v1.3",
        choices=MODEL_TYPES,
        help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
    )
    parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
    parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
    parser.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.")
    parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
    parser.add_argument(
        "--enable-optimization",
        action="store_true",
        help="Whether to enable optimization such as flash attention and fused layernorm",
    )
    parsed = parser.parse_args()
    return parsed
# ============================
# Main Gradio Script
# ============================
# as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text
# so we can't pass the models to `run_inference` as arguments.
# instead, we need to define them globally so that we can access these models inside `run_inference`

# read config
args = parse_args()
config = read_config(CONFIG_MAP[args.model_type])
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# make outputs dir
os.makedirs(args.output, exist_ok=True)

# disable torch jit as it can cause failure in gradio SDK
# gradio sdk uses torch with cuda 11.3
torch.jit._state.disable()

# set up optional acceleration deps before importing opensora modules
install_dependencies(enable_optimization=args.enable_optimization)

# import after installation
from opensora.datasets import IMG_FPS, save_sample
from opensora.datasets.aspect import get_image_size, get_num_frames
from opensora.models.text_encoder.t5 import text_preprocessing
from opensora.utils.inference_utils import (
    add_watermark,
    append_generated,
    append_score_to_prompts,
    apply_mask_strategy,
    collect_references_batch,
    dframe_to_frame,
    extract_json_from_prompts,
    extract_prompts_loop,
    get_random_prompt_by_openai,
    has_openai_key,
    merge_prompt,
    prepare_multi_resolution_info,
    refine_prompts_by_openai,
    split_prompt,
    prep_ref_and_update_mask_in_loop,
    prep_ref_and_mask
)
from opensora.utils.misc import to_torch_dtype

# some global variables
dtype = to_torch_dtype(config.dtype)
device = torch.device("cuda")
# build model
def initialize_models(mode, resolution):
    """Build all models for *mode*/*resolution*, forwarding the global optimization flag."""
    opt = args.enable_optimization
    return build_models(mode, resolution, enable_optimization=opt)
def run_inference(
    mode,
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """Generate an image or video for the given prompt and settings.

    Returns the path of the saved (optionally watermarked) sample, or None
    (with a Gradio warning) when the prompt is empty.
    """
    if prompt_text is None or prompt_text == "":
        gr.Warning("Your prompt is empty, please enter a valid prompt")
        return None

    # Dynamically choose mode based on reference image
    if reference_image is not None and mode != "Text2Image":
        mode = "i2v"

    # Initialize models
    vae, text_encoder, stdit, scheduler, config = initialize_models(mode, resolution)
    torch.manual_seed(seed)
    with torch.inference_mode():
        # ======================
        # 1. Preparation arguments
        # ======================
        # parse the inputs
        # frame_interval must be 1 so we ignore it here
        image_size = get_image_size(resolution, aspect_ratio)
        use_sdedit = config.get("use_sdedit", False)
        use_oscillation_guidance_for_text = config.get("use_oscillation_guidance_for_text", None)
        use_oscillation_guidance_for_image = config.get("use_oscillation_guidance_for_image", None)
        cond_type = config.get("cond_type", None)
        cond_type = None if cond_type == "none" else cond_type
        mask_index = None
        ref = None
        image_cfg_scale = None

        # compute generation parameters
        if mode == "Text2Image":
            num_frames = 1
            fps = IMG_FPS
        else:
            # BUGFIX: removed the dead `num_frames = config.num_frames` store
            # that was immediately overwritten on the next line.
            num_frames = get_num_frames(length)

        condition_frame_length = config.get("condition_frame_length", 5)
        condition_frame_edit = config.get("condition_frame_edit", 0.0)

        input_size = (num_frames, *image_size)
        latent_size = vae.get_latent_size(input_size)
        multi_resolution = "OpenSora"
        align = 5

        # == prepare mask strategy ==
        if mode == "Text2Image":
            mask_strategy = [None]
            mask_index = []
        elif mode == "Text2Video":
            if reference_image is not None:
                mask_strategy = ["0"]
                mask_index = [0]
            else:
                mask_strategy = [None]
                mask_index = []
        elif mode == "i2v":
            mask_strategy = ["0"]
            mask_index = [0]
        else:
            raise ValueError(f"Invalid mode: {mode}")

        # == prepare reference ==
        # (Text2Video and i2v shared identical bodies; merged into one branch.)
        if mode == "Text2Image":
            refs = [""]
        elif mode in ("Text2Video", "i2v"):
            if reference_image is not None:
                # save image to disk so collect_references_batch can load it by path
                from PIL import Image

                im = Image.fromarray(reference_image)
                temp_file = NamedTemporaryFile(suffix=".png")
                im.save(temp_file.name)
                refs = [temp_file.name]
            else:
                refs = [""]
        else:
            raise ValueError(f"Invalid mode: {mode}")

        # == get json from prompts ==
        batch_prompts = [prompt_text]
        batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy)

        # == get reference for condition ==
        refs = collect_references_batch(refs, vae, image_size)
        target_shape = [len(batch_prompts), vae.out_channels, *latent_size]
        if mode == "i2v":
            image_cfg_scale = config.get("image_cfg_scale", 7.5)
            ref, mask_index = prep_ref_and_mask(
                cond_type, condition_frame_length, refs, target_shape, num_loop, device, dtype
            )

        # == multi-resolution info ==
        model_args = prepare_multi_resolution_info(
            multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
        )

        # == process prompts step by step ==
        # 0. split prompt
        # each element in the list is [prompt_segment_list, loop_idx_list]
        batched_prompt_segment_list = []
        batched_loop_idx_list = []
        for prompt in batch_prompts:
            prompt_segment_list, loop_idx_list = split_prompt(prompt)
            batched_prompt_segment_list.append(prompt_segment_list)
            batched_loop_idx_list.append(loop_idx_list)

        # 1. refine prompt by openai
        if refine_prompt:
            # check if openai key is provided
            if not has_openai_key():
                gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.")
            else:
                for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
                    batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list)

        # process scores (None disables the corresponding prompt suffix)
        aesthetic_score = aesthetic_score if use_aesthetic_score else None
        motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None
        camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion
        # 2. append score
        for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
            batched_prompt_segment_list[idx] = append_score_to_prompts(
                prompt_segment_list,
                aes=aesthetic_score,
                flow=motion_strength,
                camera_motion=camera_motion,
            )

        # 3. clean prompt with T5
        for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
            batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list]

        # 4. merge to obtain the final prompt
        batch_prompts = []
        for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list):
            batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list))

        # =========================
        # Generate image/video
        # =========================
        video_clips = []
        for loop_i in range(num_loop):
            # 4.4 sample in hidden space
            batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)

            # == sampling ==
            z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
            # mask_index is set in every mode branch above, so `masks` stays
            # None here and conditioning flows through x_cond_mask instead.
            masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align) if mask_index is None else None
            x_cond_mask = (
                torch.zeros(len(batch_prompts), vae.out_channels, *latent_size, device=device).to(dtype)
                if mask_index is not None
                else None
            )
            if x_cond_mask is not None and mask_index is not None:
                # mark the conditioned latent frames
                x_cond_mask[:, :, mask_index, :, :] = 1.0

            # 4.6. diffusion sampling
            # hack to update num_sampling_steps and cfg_scale
            scheduler_kwargs = config.scheduler.copy()
            scheduler_kwargs.pop("type")
            scheduler_kwargs["num_sampling_steps"] = sampling_steps
            scheduler_kwargs["cfg_scale"] = cfg_scale
            scheduler.__init__(**scheduler_kwargs)
            samples = scheduler.sample(
                stdit,
                text_encoder,
                z=z,
                z_cond=ref,
                z_cond_mask=x_cond_mask,
                prompts=batch_prompts_loop,
                device=device,
                additional_args=model_args,
                progress=True,
                mask=masks,
                mask_index=mask_index,
                image_cfg_scale=image_cfg_scale,
                use_sdedit=use_sdedit,
                use_oscillation_guidance_for_text=use_oscillation_guidance_for_text,
                use_oscillation_guidance_for_image=use_oscillation_guidance_for_image,
            )

            # NOTE(review): this condition skips condition-prep for loops 0 and 1
            # and still runs it on the final loop; `loop_i < num_loop - 1` looks
            # like the intent — confirm against the non-Gradio inference script.
            if loop_i > 1:  # process conditions for subsequent loop
                if cond_type is not None:  # i2v or v2v
                    # BUGFIX: was `loop_i == loop_i - 1`, which is always False;
                    # the last loop was never flagged as last.
                    is_last_loop = loop_i == num_loop - 1
                    ref, mask_index = prep_ref_and_update_mask_in_loop(
                        cond_type,
                        condition_frame_length,
                        samples,
                        refs,
                        target_shape,
                        is_last_loop,
                        device,
                        dtype,
                    )
                else:
                    refs, mask_strategy = append_generated(
                        vae,
                        samples,
                        refs,
                        mask_strategy,
                        loop_i,
                        condition_frame_length,
                        condition_frame_edit,
                        is_latent=True,
                    )
            video_clips.append(samples)

        # =========================
        # Save output
        # =========================
        video_clips = [val[0] for val in video_clips]
        # Drop the conditioning frames that overlap with the previous loop.
        for i in range(1, num_loop):
            video_clips[i] = video_clips[i][:, condition_frame_length:]
        video = torch.cat(video_clips, dim=1)
        # Trim to a multiple of 5 latent frames (the VAE's temporal stride),
        # keeping at least one frame.
        t_cut = max(video.size(1) // 5 * 5, 1)
        if t_cut < video.size(1):
            video = video[:, :t_cut]
        video = vae.decode(video.to(dtype), num_frames=t_cut * 17 // 5).squeeze(0)
        current_datetime = datetime.datetime.now()
        timestamp = current_datetime.timestamp()
        save_path = os.path.join(args.output, f"output_{timestamp}")
        saved_path = save_sample(video, save_path=save_path, fps=24)
        torch.cuda.empty_cache()

        # add watermark (videos only; fall back to the raw sample on failure)
        if mode != "Text2Image" and os.path.exists(WATERMARK_PATH):
            watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4")
            success = add_watermark(saved_path, WATERMARK_PATH, watermarked_path)
            if success:
                return watermarked_path
            return saved_path
        return saved_path
@spaces.GPU(duration=200)
def run_image_inference(
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """Image-generation entry point for the Gradio button; wraps run_inference in Text2Image mode."""
    forwarded = (
        prompt_text,
        resolution,
        aspect_ratio,
        length,
        motion_strength,
        aesthetic_score,
        use_motion_strength,
        use_aesthetic_score,
        camera_motion,
        reference_image,
        refine_prompt,
        fps,
        num_loop,
        seed,
        sampling_steps,
        cfg_scale,
    )
    return run_inference("Text2Image", *forwarded)
@spaces.GPU(duration=200)
def run_video_inference(
    prompt_text,
    resolution,
    aspect_ratio,
    length,
    motion_strength,
    aesthetic_score,
    use_motion_strength,
    use_aesthetic_score,
    camera_motion,
    reference_image,
    refine_prompt,
    fps,
    num_loop,
    seed,
    sampling_steps,
    cfg_scale,
):
    """Video-generation entry point for the Gradio button; wraps run_inference in Text2Video mode.

    A former guard against OOM-prone resolution/length combinations was removed
    upstream; all combinations are forwarded as-is.
    """
    forwarded = (
        prompt_text,
        resolution,
        aspect_ratio,
        length,
        motion_strength,
        aesthetic_score,
        use_motion_strength,
        use_aesthetic_score,
        camera_motion,
        reference_image,
        refine_prompt,
        fps,
        num_loop,
        seed,
        sampling_steps,
        cfg_scale,
    )
    return run_inference("Text2Video", *forwarded)
def generate_random_prompt():
    """Fetch a random prompt via OpenAI; warn and return None when no API key is set."""
    if "OPENAI_API_KEY" not in os.environ:
        gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt")
        return None
    return get_random_prompt_by_openai()
def main():
    """Build the Gradio Blocks UI, wire up the buttons, and launch the server."""
    # create demo
    with gr.Blocks() as demo:
        # Header banner with project links.
        with gr.Row():
            with gr.Column():
                gr.HTML(
                    """
                    <div style='text-align: center;'>
                    <p align="center">
                    <img src="https://github.com/hpcaitech/Open-Sora-Demo/blob/main/readme/icon.png" width="250"/>
                    </p>
                    <div style="display: flex; gap: 10px; justify-content: center;">
                    <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
                    <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
                    <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
                    <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
                    <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
                    <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
                    <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
                    </div>
                    <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
                    </div>
                    """
                )

        with gr.Row():
            with gr.Column():
                # Prompt entry; OpenAI-backed controls are disabled without an API key.
                prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4)
                refine_prompt = gr.Checkbox(
                    value=has_openai_key(), label="Refine prompt with GPT4o", interactive=has_openai_key()
                )
                random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=has_openai_key())

                gr.Markdown("## Basic Settings")
                resolution = gr.Radio(
                    choices=["360p", "720p"],
                    value="720p",
                    label="Resolution",
                )
                aspect_ratio = gr.Radio(
                    choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
                    value="9:16",
                    label="Aspect Ratio (H:W)",
                )
                length = gr.Radio(
                    choices=[1, 49, 65, 81, 97, 113],
                    value=97,
                    label="Video Length (Number of Frames)",
                    info="Setting the number of frames to 1 indicates image generation instead of video generation.",
                )

                with gr.Row():
                    seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed")
                    sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps")
                    cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale")

                # Optional score hints appended to the prompt (see run_inference).
                with gr.Row():
                    with gr.Column():
                        motion_strength = gr.Radio(
                            choices=["very low", "low", "fair", "high", "very high", "extremely high"],
                            value="fair",
                            label="Motion Strength",
                            info="Only effective for video generation",
                        )
                        use_motion_strength = gr.Checkbox(value=True, label="Enable")

                    with gr.Column():
                        aesthetic_score = gr.Radio(
                            choices=["terrible", "very poor", "poor", "fair", "good", "very good", "excellent"],
                            value="excellent",
                            label="Aesthetic",
                            info="Effective for text & video generation",
                        )
                        use_aesthetic_score = gr.Checkbox(value=True, label="Enable")

                camera_motion = gr.Radio(
                    value="none",
                    label="Camera Motion",
                    choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"],
                    interactive=True,
                )

                gr.Markdown("## Advanced Settings")
                with gr.Row():
                    fps = gr.Slider(
                        value=24,
                        minimum=1,
                        maximum=60,
                        step=1,
                        label="FPS",
                        info="This is the frames per seconds for video generation, keep it to 24 if you are not sure",
                    )
                    num_loop = gr.Slider(
                        value=1,
                        minimum=1,
                        maximum=20,
                        step=1,
                        label="Number of Loops",
                        info="This will change the length of the generated video, keep it to 1 if you are not sure",
                    )

                # Optional conditioning image (switches generation to i2v mode).
                gr.Markdown("## Reference Image")
                reference_image = gr.Image(label="Image (optional)", show_download_button=True)

            with gr.Column():
                output_video = gr.Video(label="Output Video", height="100%")

        with gr.Row():
            image_gen_button = gr.Button("Generate image")
            video_gen_button = gr.Button("Generate video")

        # Generated images land in the reference-image slot so they can seed i2v.
        image_gen_button.click(
            fn=run_image_inference,
            inputs=[
                prompt_text,
                resolution,
                aspect_ratio,
                length,
                motion_strength,
                aesthetic_score,
                use_motion_strength,
                use_aesthetic_score,
                camera_motion,
                reference_image,
                refine_prompt,
                fps,
                num_loop,
                seed,
                sampling_steps,
                cfg_scale,
            ],
            outputs=reference_image,
        )
        video_gen_button.click(
            fn=run_video_inference,
            inputs=[
                prompt_text,
                resolution,
                aspect_ratio,
                length,
                motion_strength,
                aesthetic_score,
                use_motion_strength,
                use_aesthetic_score,
                camera_motion,
                reference_image,
                refine_prompt,
                fps,
                num_loop,
                seed,
                sampling_steps,
                cfg_scale,
            ],
            outputs=output_video,
        )
        random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text)

    # launch: single-worker queue since inference holds the GPU
    demo.queue(max_size=5, default_concurrency_limit=1)
    demo.launch(server_port=args.port, server_name=args.host, share=args.share, max_threads=1)

0
opensora/__init__.py Normal file
View File

View File

View File

@ -0,0 +1,271 @@
import warnings
from collections.abc import Iterable
from typing import Callable, ContextManager, Optional, Tuple
import torch
import torch.nn as nn
from colossalai.utils import get_current_device
from torch.utils.checkpoint import (
_DEFAULT_DETERMINISM_MODE,
CheckpointFunction,
_checkpoint_without_reentrant_generator,
checkpoint_sequential,
noop_context_fn,
)
class ActivationManager:
    """Manages offloading of checkpointed activations to a pinned CPU buffer.

    Offloaded tensors are tracked in LIFO (stack) order via their ``id()``:
    ``offload`` copies a tensor's storage into the pinned buffer and repoints
    ``x.data`` at the host copy; ``onload`` pops it back onto the current
    device. Tensors registered through ``add_ignore_tensor`` are skipped until
    the stack drains, which handles tensors shared across checkpoints.
    """

    def __init__(self):
        self.enable = False  # flipped to True by setup_buffer()
        self.buffer = None  # pinned host buffer backing offloaded activations
        self.total_size = 0  # buffer capacity, in elements
        self.avail_offset = 0  # first free element offset in the buffer
        self.tensor_id_queue = []  # stack of id()s of currently offloaded tensors
        self.ignore_tensor_id_set = set()  # ids skipped by offload/onload

    def setup_buffer(self, numel: int, dtype: torch.dtype):
        """Allocate a pinned host buffer of ``numel`` elements and enable offloading."""
        self.buffer = torch.empty(numel, dtype=dtype, pin_memory=True)
        self.total_size = numel
        self.enable = True

    def offload(self, x: torch.Tensor) -> None:
        """Copy ``x`` into the pinned buffer and repoint ``x.data`` at the host copy."""
        if not self.enable or id(x) in self.ignore_tensor_id_set:
            return
        size = x.numel()
        if self.avail_offset + size > self.total_size:
            raise RuntimeError("Activation buffer is full")
        # fix: original used f-strings with no placeholders (ruff F541)
        assert x.dtype == self.buffer.dtype, "Wrong dtype of offload tensor"
        cpu_x = self.buffer[self.avail_offset : self.avail_offset + size].view_as(x)
        cpu_x.copy_(x)
        x.data = cpu_x
        self.avail_offset += size
        self.tensor_id_queue.append(id(x))

    def onload(self, x: torch.Tensor) -> None:
        """Move the most recently offloaded tensor ``x`` back to the current device."""
        if not self.enable or id(x) in self.ignore_tensor_id_set:
            return
        assert self.tensor_id_queue[-1] == id(x), "Wrong order of offload/onload"
        # current x is pinned memory
        assert x.data.is_pinned()
        x.data = x.data.to(get_current_device(), non_blocking=True)
        self.tensor_id_queue.pop()
        self.avail_offset -= x.numel()
        if len(self.tensor_id_queue) == 0:
            # everything is back on device; reset the shared-tensor bookkeeping
            self.ignore_tensor_id_set.clear()

    def add_ignore_tensor(self, x: torch.Tensor) -> None:
        """Mark ``x`` (by id) to be skipped by offload/onload until the stack drains."""
        self.ignore_tensor_id_set.add(id(x))

    def is_top_tensor(self, x: torch.Tensor) -> bool:
        """Return True when ``x`` is the most recently offloaded tensor."""
        return len(self.tensor_id_queue) > 0 and self.tensor_id_queue[-1] == id(x)
# Process-wide singleton shared by the checkpoint wrappers in this module.
GLOBAL_ACTIVATION_MANAGER = ActivationManager()
class CheckpointFunctionWithOffload(torch.autograd.Function):
    """Reentrant checkpoint function that offloads saved activations to CPU.

    Wraps ``torch.utils.checkpoint.CheckpointFunction``: after the forward
    pass, tensor inputs are pushed into GLOBAL_ACTIVATION_MANAGER's pinned
    buffer; before the backward pass they are popped back (stack order).
    """

    @staticmethod
    def forward(ctx, run_function, preserve_rng_state, *args):
        for x in args[::-1]:
            # handle those tensors are used in multiple checkpoints:
            # bring a still-offloaded tensor back and exempt it from
            # re-offloading while this stack is live
            if GLOBAL_ACTIVATION_MANAGER.is_top_tensor(x):
                GLOBAL_ACTIVATION_MANAGER.onload(x)
                GLOBAL_ACTIVATION_MANAGER.add_ignore_tensor(x)
        out = CheckpointFunction.forward(ctx, run_function, preserve_rng_state, *args)
        for x in args:
            if torch.is_tensor(x):
                GLOBAL_ACTIVATION_MANAGER.offload(x)
        return out

    @staticmethod
    def backward(ctx, *args):
        # with stack-fashion, the last tensor is the first to be loaded
        for tensor in ctx.saved_tensors[::-1]:
            GLOBAL_ACTIVATION_MANAGER.onload(tensor)
        return CheckpointFunction.backward(ctx, *args)
# TorchDynamo does not step inside utils.checkpoint function. The flow
# looks likes this
#  1) TorchDynamo tries to wrap utils.checkpoint in a HigherOrderOp by
#     speculatively checking if the forward function is safe to trace.
#  2) If yes, then Dynamo-generated Fx graph has the wrapped higher
#     order op. As a result, TorchDynamo does not look inside utils.checkpoint.
#  3) If not, then TorchDynamo falls back to eager by performing a graph
#     break. And here, the following disable wrapper ensures that
#     TorchDynamo does not trigger again on the frames created by
#     utils.checkpoint innards.
#
# NOTE: this is a fork of torch.utils.checkpoint.checkpoint with an identical
# API. The only functional difference is that the reentrant path dispatches to
# CheckpointFunctionWithOffload, so saved activations are offloaded into the
# pinned CPU buffer managed by GLOBAL_ACTIVATION_MANAGER.
@torch._disable_dynamo
def checkpoint(
    function,
    *args,
    use_reentrant: Optional[bool] = None,
    context_fn: Callable[[], Tuple[ContextManager, ContextManager]] = noop_context_fn,
    determinism_check: str = _DEFAULT_DETERMINISM_MODE,
    debug: bool = False,
    **kwargs,
):
    r"""Checkpoint a model or part of the model.

    Activation checkpointing is a technique that trades compute for memory.
    Instead of keeping tensors needed for backward alive until they are used in
    gradient computation during backward, forward computation in checkpointed
    regions omits saving tensors for backward and recomputes them during the
    backward pass. Activation checkpointing can be applied to any part of a
    model.

    There are currently two checkpointing implementations available, determined
    by the :attr:`use_reentrant` parameter. It is recommended that you use
    ``use_reentrant=False``. Please refer the note below for a discussion of
    their differences.

    .. warning::

        If the :attr:`function` invocation during the backward pass differs
        from the forward pass, e.g., due to a global variable, the checkpointed
        version may not be equivalent, potentially causing an
        error being raised or leading to silently incorrect gradients.

    .. warning::

        The ``use_reentrant`` parameter should be passed explicitly. In version
        2.4 we will raise an exception if ``use_reentrant`` is not passed.
        If you are using the ``use_reentrant=True`` variant, please refer to the
        note below for important considerations and potential limitations.

    .. note::

        The reentrant variant of checkpoint (``use_reentrant=True``) and
        the non-reentrant variant of checkpoint (``use_reentrant=False``)
        differ in the following ways:

        * Non-reentrant checkpoint stops recomputation as soon as all needed
          intermediate activations have been recomputed. This feature is enabled
          by default, but can be disabled with :func:`set_checkpoint_early_stop`.
          Reentrant checkpoint always recomputes :attr:`function` in its
          entirety during the backward pass.

        * The reentrant variant does not record the autograd graph during the
          forward pass, as it runs with the forward pass under
          :func:`torch.no_grad`. The non-reentrant version does record the
          autograd graph, allowing one to perform backward on the graph within
          checkpointed regions.

        * The reentrant checkpoint only supports the
          :func:`torch.autograd.backward` API for the backward pass without its
          `inputs` argument, while the non-reentrant version supports all ways
          of performing the backward pass.

        * At least one input and output must have ``requires_grad=True`` for the
          reentrant variant. If this condition is unmet, the checkpointed part
          of the model will not have gradients. The non-reentrant version does
          not have this requirement.

        * The reentrant version does not consider tensors in nested structures
          (e.g., custom objects, lists, dicts, etc) as participating in
          autograd, while the non-reentrant version does.

        * The reentrant checkpoint does not support checkpointed regions with
          detached tensors from the computational graph, whereas the
          non-reentrant version does. For the reentrant variant, if the
          checkpointed segment contains tensors detached using ``detach()`` or
          with :func:`torch.no_grad`, the backward pass will raise an error.
          This is because ``checkpoint`` makes all the outputs require gradients
          and this causes issues when a tensor is defined to have no gradient in
          the model. To avoid this, detach the tensors outside of the
          ``checkpoint`` function.

    Args:
        function: describes what to run in the forward pass of the model or
            part of the model. It should also know how to handle the inputs
            passed as the tuple. For example, in LSTM, if user passes
            ``(activation, hidden)``, :attr:`function` should correctly use the
            first input as ``activation`` and the second input as ``hidden``
        preserve_rng_state(bool, optional): Omit stashing and restoring
            the RNG state during each checkpoint. Note that under torch.compile,
            this flag doesn't take effect and we always preserve RNG state.
            Default: ``True``
        use_reentrant(bool):
            specify whether to use the activation checkpoint variant that
            requires reentrant autograd. This parameter should be passed
            explicitly. In version 2.4 we will raise an exception if
            ``use_reentrant`` is not passed. If ``use_reentrant=False``,
            ``checkpoint`` will use an implementation that does not require
            reentrant autograd. This allows ``checkpoint`` to support additional
            functionality, such as working as expected with
            ``torch.autograd.grad`` and support for keyword arguments input into
            the checkpointed function.
        context_fn(Callable, optional): A callable returning a tuple of two
            context managers. The function and its recomputation will be run
            under the first and second context managers respectively.
            This argument is only supported if ``use_reentrant=False``.
        determinism_check(str, optional): A string specifying the determinism
            check to perform. By default it is set to ``"default"`` which
            compares the shapes, dtypes, and devices of the recomputed tensors
            against those the saved tensors. To turn off this check, specify
            ``"none"``. Currently these are the only two supported values.
            Please open an issue if you would like to see more determinism
            checks. This argument is only supported if ``use_reentrant=False``,
            if ``use_reentrant=True``, the determinism check is always disabled.
        debug(bool, optional): If ``True``, error messages will also include
            a trace of the operators ran during the original forward computation
            as well as the recomputation. This argument is only supported if
            ``use_reentrant=False``.
        args: tuple containing inputs to the :attr:`function`

    Returns:
        Output of running :attr:`function` on :attr:`*args`
    """
    if use_reentrant is None:
        warnings.warn(
            "torch.utils.checkpoint: the use_reentrant parameter should be "
            "passed explicitly. In version 2.4 we will raise an exception "
            "if use_reentrant is not passed. use_reentrant=False is "
            "recommended, but if you need to preserve the current default "
            "behavior, you can pass use_reentrant=True. Refer to docs for more "
            "details on the differences between the two variants.",
            stacklevel=2,
        )
        use_reentrant = True
    # Hack to mix *args with **kwargs in a python 2.7-compliant way
    preserve = kwargs.pop("preserve_rng_state", True)
    if kwargs and use_reentrant:
        raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs))
    if use_reentrant:
        if context_fn is not noop_context_fn or debug is not False:
            raise ValueError("Passing `context_fn` or `debug` is only supported when " "use_reentrant=False.")
        # only divergence from upstream: offloading variant instead of CheckpointFunction
        return CheckpointFunctionWithOffload.apply(function, preserve, *args)
    else:
        gen = _checkpoint_without_reentrant_generator(
            function, preserve, context_fn, determinism_check, debug, *args, **kwargs
        )
        # Runs pre-forward logic
        next(gen)
        ret = function(*args, **kwargs)
        # Runs post-forward logic
        try:
            next(gen)
        except StopIteration:
            return ret
def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """Tag every submodule of ``model`` for gradient checkpointing.

    Sets three attributes on each submodule: ``grad_checkpointing`` (always
    True), ``fp32_attention``, and ``grad_checkpointing_step``, which
    ``auto_grad_checkpoint`` later reads.
    """
    assert isinstance(model, nn.Module)

    def _tag(submodule):
        # Module.apply visits every submodule, including ``model`` itself.
        submodule.grad_checkpointing = True
        submodule.fp32_attention = use_fp32_attention
        submodule.grad_checkpointing_step = gc_step

    model.apply(_tag)
def auto_grad_checkpoint(module, *args, **kwargs):
    """Run ``module``, checkpointing it when tagged by ``set_grad_checkpoint``."""
    if not getattr(module, "grad_checkpointing", False):
        # untagged module: plain forward
        return module(*args, **kwargs)
    if isinstance(module, Iterable):
        # sequential container: checkpoint in segments of grad_checkpointing_step
        gc_step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, gc_step, *args, use_reentrant=False, **kwargs)
    # single module: reentrant path (the offloading variant defined above)
    return checkpoint(module, *args, use_reentrant=True, **kwargs)

View File

@ -0,0 +1,188 @@
import torch
import torch.distributed as dist
# ====================
# All-To-All
# ====================
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    """Scatter ``input_`` along ``scatter_dim`` across ``group`` and gather the
    received shards back along ``gather_dim``."""
    send_shards = [shard.contiguous() for shard in torch.tensor_split(input_, world_size, scatter_dim)]
    recv_shards = [torch.empty_like(send_shards[0]) for _ in range(world_size)]
    dist.all_to_all(recv_shards, send_shards, group=group)
    return torch.cat(recv_shards, dim=gather_dim).contiguous()
class _AllToAll(torch.autograd.Function):
    """All-to-all communication.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        # stash the communication layout so backward can invert it
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # the inverse of an all-to-all is an all-to-all with the dims swapped
        grad_output = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        return (
            grad_output,
            None,  # process_group: no gradient
            None,  # scatter_dim: no gradient
            None,  # gather_dim: no gradient
        )
def all_to_all(
    input_: torch.Tensor,
    process_group: dist.ProcessGroup,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    """Autograd-aware all-to-all: scatter on ``scatter_dim``, gather on ``gather_dim``."""
    out = _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
    return out
def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
):
    """Gather ``input_`` from every rank and return the list of per-rank tensors.

    Bug fix: the original body referenced an undefined local ``gather_list``
    (NameError on every call) and passed a non-existent ``gather_dim`` kwarg to
    ``dist.gather``. ``dist.all_gather`` collects onto every rank, which matches
    how the returned list is used.

    NOTE(review): this definition is shadowed by the later
    ``_gather(input_, pg, dim)`` below, so it is effectively dead code;
    ``gather_dim`` is kept only for signature compatibility (callers would
    concatenate along it).
    """
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    # all_gather requires a contiguous input tensor
    dist.all_gather(gather_list, input_.contiguous(), group=group)
    return gather_list
# ====================
# Gather-Split
# ====================
def _split(input_, pg: dist.ProcessGroup, dim=-1):
    """Return this rank's chunk of ``input_``, split evenly along ``dim``."""
    world_size = dist.get_world_size(pg)
    rank = dist.get_rank(pg)
    # nothing to split when the group has a single rank
    if world_size == 1:
        return input_

    dim_size = input_.size(dim)
    assert dim_size % world_size == 0, (
        f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), "
        f"cannot split tensor evenly"
    )

    chunks = torch.split(input_, dim_size // world_size, dim=dim)
    return chunks[rank].contiguous()
def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    """All-gather ``input_`` across ``pg`` and concatenate the results along ``dim``."""
    # skip if only one rank involved
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)
    # fix: removed a stray ``dist.get_rank(pg)`` call whose result was discarded
    if world_size == 1:
        return input_
    # all gather
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    # NOTE(review): this guard restricts the helper to CUDA tensors,
    # presumably for the NCCL backend — confirm before using with gloo.
    assert input_.device.type == "cuda"
    torch.distributed.all_gather(tensor_list, input_, group=pg)
    # concat
    output = torch.cat(tensor_list, dim=dim).contiguous()
    return output
class _GatherForwardSplitBackward(torch.autograd.Function):
    """Gather the input from model parallel region and concatenate.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): _gather requires (input_, pg, dim); this call omits the
        # last two arguments and would raise TypeError if ONNX export ever
        # reached it — confirm and fix if export is needed.
        return _gather(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        # stash group/dim/scale so backward can split the gradient back
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _gather(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # optional gradient rescaling by world size before splitting back
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.mode)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.mode)
        return _split(grad_output, ctx.mode, ctx.dim), None, None, None
class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the corresponding chuck to the rank.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_):
        # NOTE(review): _split requires (input_, pg, dim); this call omits the
        # last two arguments and would raise TypeError if ONNX export ever
        # reached it — confirm and fix if export is needed.
        return _split(input_)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        # stash group/dim/scale so backward can gather the gradient back
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _split(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # optional gradient rescaling by world size before gathering back
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.mode)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.mode)
        return _gather(grad_output, ctx.mode, ctx.dim), None, None, None
def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
    """Split ``input_`` along ``dim`` in forward; all-gather gradients in backward.

    NOTE(review): ``grad_scale`` is compared against the strings "up"/"down" in
    backward, so the numeric default 1.0 effectively means "no scaling".
    """
    return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale)
def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
    """All-gather ``input_`` along ``dim`` in forward; split gradients in backward.

    ``grad_scale`` may be "up" or "down" to rescale gradients by world size;
    any other value (including the default None) leaves them unscaled.
    """
    return _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale)

View File

@ -0,0 +1,29 @@
import torch.distributed as dist
# Registry of process groups keyed by parallelism kind ("data", "sequence",
# "tensor", plus an optional "mixed_dp_group").
_GLOBAL_PARALLEL_GROUPS = dict()


def set_data_parallel_group(group: dist.ProcessGroup):
    """Register the data-parallel process group."""
    _GLOBAL_PARALLEL_GROUPS["data"] = group


def get_data_parallel_group(get_mixed_dp_pg: bool = False):
    """Return the data-parallel group.

    Prefers the registered "mixed_dp_group" when ``get_mixed_dp_pg`` is True;
    falls back to the world group when no data group was registered.
    """
    if get_mixed_dp_pg and "mixed_dp_group" in _GLOBAL_PARALLEL_GROUPS:
        return _GLOBAL_PARALLEL_GROUPS["mixed_dp_group"]
    return _GLOBAL_PARALLEL_GROUPS.get("data", dist.group.WORLD)


def set_sequence_parallel_group(group: dist.ProcessGroup):
    """Register the sequence-parallel process group."""
    _GLOBAL_PARALLEL_GROUPS["sequence"] = group


def get_sequence_parallel_group():
    """Return the registered sequence-parallel group, or None."""
    return _GLOBAL_PARALLEL_GROUPS.get("sequence")


def set_tensor_parallel_group(group: dist.ProcessGroup):
    """Register the tensor-parallel process group."""
    _GLOBAL_PARALLEL_GROUPS["tensor"] = group


def get_tensor_parallel_group():
    """Return the registered tensor-parallel group, or None."""
    return _GLOBAL_PARALLEL_GROUPS.get("tensor")

View File

@ -0,0 +1,39 @@
import torch
import torch.nn as nn
class T5LayerNorm(nn.Module):
    """T5-style RMSNorm: scale-only layer norm, no bias, no mean subtraction."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467):
        # only scales, never shifts. The mean square is accumulated in fp32 so
        # half-precision inputs stay numerically stable.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)
        # cast back down when the weights are half precision
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)
        return self.weight * normed

    @staticmethod
    def from_native_module(module, *args, **kwargs):
        """Build a T5LayerNorm mirroring an apex ``FusedRMSNorm`` instance."""
        assert module.__class__.__name__ == "FusedRMSNorm", (
            "Recovering T5LayerNorm requires the original layer to be apex's Fused RMS Norm."
            "Apex's fused norm is automatically used by Hugging Face Transformers https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L265C5-L265C48"
        )
        converted = T5LayerNorm(module.normalized_shape, eps=module.eps)
        converted.weight.data.copy_(module.weight.data)
        converted = converted.to(module.weight.device)
        return converted

View File

@ -0,0 +1,41 @@
from colossalai.shardformer.modeling.jit import get_jit_fused_dropout_add_func
from colossalai.shardformer.modeling.t5 import get_jit_fused_T5_layer_ff_forward, get_T5_layer_self_attention_forward
from colossalai.shardformer.policies.base_policy import Policy, SubModuleReplacementDescription
class T5EncoderPolicy(Policy):
    """ColossalAI shardformer policy for a T5 encoder.

    Only JIT-fused forward/dropout replacements are applied; tensor
    parallelism and flash attention must be disabled in the shard config.
    """

    def config_sanity_check(self):
        # this policy only swaps forwards; parallelism features are unsupported
        assert not self.shard_config.enable_tensor_parallelism
        assert not self.shard_config.enable_flash_attention

    def preprocess(self):
        # no model preprocessing needed
        return self.model

    def module_policy(self):
        # NOTE(review): T5Stack is imported but never used below
        from transformers.models.t5.modeling_t5 import T5LayerFF, T5LayerSelfAttention, T5Stack

        policy = {}

        # use jit operator
        if self.shard_config.enable_jit_fused:
            # replace the feed-forward block's forward with the fused version
            self.append_or_create_method_replacement(
                description={
                    "forward": get_jit_fused_T5_layer_ff_forward(),
                    "dropout_add": get_jit_fused_dropout_add_func(),
                },
                policy=policy,
                target_key=T5LayerFF,
            )
            # replace self-attention forward with the fused version
            self.append_or_create_method_replacement(
                description={
                    "forward": get_T5_layer_self_attention_forward(),
                    "dropout_add": get_jit_fused_dropout_add_func(),
                },
                policy=policy,
                target_key=T5LayerSelfAttention,
            )
        return policy

    def postprocess(self):
        # nothing to undo after sharding
        return self.model

View File

@ -0,0 +1,5 @@
from .dc_ae import *
from .hunyuan_vae import *
from .mmdit import *
from .text import *
from .vae import *

View File

@ -0,0 +1 @@
from .ae_model_zoo import DC_AE

View File

@ -0,0 +1,85 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from typing import Callable, Optional
import diffusers
import torch
from huggingface_hub import PyTorchModelHubMixin
from torch import nn
from opensora.registry import MODELS
from opensora.utils.ckpt import load_checkpoint
from .models.dc_ae import DCAE, DCAEConfig, dc_ae_f32
__all__ = ["create_dc_ae_model_cfg", "DCAE_HF", "DC_AE"]
# Maps model name -> (config factory, default pretrained checkpoint path).
REGISTERED_DCAE_MODEL: dict[str, tuple[Callable, Optional[str]]] = {
    "dc-ae-f32t4c128": (dc_ae_f32, None),
}
def create_dc_ae_model_cfg(name: str, pretrained_path: Optional[str] = None) -> DCAEConfig:
    """Build a DCAEConfig for a registered DC-AE model name.

    Args:
        name: key into REGISTERED_DCAE_MODEL.
        pretrained_path: checkpoint path; falls back to the registry default.

    Returns:
        The DCAEConfig produced by the registered factory.

    Raises:
        ValueError: if ``name`` is not a registered DC-AE model.
    """
    if name not in REGISTERED_DCAE_MODEL:
        # raise instead of assert: input validation must survive `python -O`
        raise ValueError(f"{name} is not supported")
    dc_ae_cls, default_pt_path = REGISTERED_DCAE_MODEL[name]
    pretrained_path = default_pt_path if pretrained_path is None else pretrained_path
    model_cfg = dc_ae_cls(name, pretrained_path)
    return model_cfg
class DCAE_HF(DCAE, PyTorchModelHubMixin):
    """DCAE with the HuggingFace Hub mixin so weights can be pulled by model name."""

    def __init__(self, model_name: str):
        # resolve the registered config, then initialize the plain DCAE with it
        cfg = create_dc_ae_model_cfg(model_name)
        DCAE.__init__(self, cfg)
@MODELS.register_module("dc_ae")
def DC_AE(
    model_name: str,
    device_map: str | torch.device = "cuda",
    torch_dtype: torch.dtype = torch.bfloat16,
    from_scratch: bool = False,
    from_pretrained: str | None = None,
    is_training: bool = False,
    use_spatial_tiling: bool = False,
    use_temporal_tiling: bool = False,
    spatial_tile_size: int = 256,
    temporal_tile_size: int = 32,
    tile_overlap_factor: float = 0.25,
    scaling_factor: float | None = None,
    disc_off_grad_ckpt: bool = False,
) -> DCAE_HF:
    """Registry factory for DC-AE models.

    Args:
        model_name: key into the DC-AE registry (see REGISTERED_DCAE_MODEL).
        device_map: device the model is moved to.
        torch_dtype: parameter dtype after the move.
        from_scratch: when True, build with random init instead of hub weights.
        from_pretrained: optional local checkpoint loaded on top.
        is_training: stored on ``model.cfg`` for downstream logic.
        use_spatial_tiling, use_temporal_tiling, spatial_tile_size,
            temporal_tile_size, tile_overlap_factor: tiled-inference settings,
            stored as attributes on the model.
        scaling_factor: overrides the model's scaling factor when given.
        disc_off_grad_ckpt: forwarded to ``model.decoder``.

    Returns:
        The configured DCAE_HF instance.
    """
    if not from_scratch:
        # pull pretrained weights from the HuggingFace Hub
        model = DCAE_HF.from_pretrained(model_name).to(device_map, torch_dtype)
    else:
        model = DCAE_HF(model_name).to(device_map, torch_dtype)
    if from_pretrained is not None:
        # a local checkpoint, when given, overrides the hub weights
        model = load_checkpoint(model, from_pretrained, device_map=device_map)
        print(f"loaded dc_ae from ckpt path: {from_pretrained}")
    model.cfg.is_training = is_training
    model.use_spatial_tiling = use_spatial_tiling
    model.use_temporal_tiling = use_temporal_tiling
    model.spatial_tile_size = spatial_tile_size
    model.temporal_tile_size = temporal_tile_size
    model.tile_overlap_factor = tile_overlap_factor
    if scaling_factor is not None:
        model.scaling_factor = scaling_factor
    model.decoder.disc_off_grad_ckpt = disc_off_grad_ckpt
    return model

View File

@ -0,0 +1 @@
from .dc_ae import *

View File

@ -0,0 +1,815 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass, field
from typing import Any, Optional
import torch
import torch.nn as nn
from omegaconf import MISSING, OmegaConf
from torch import Tensor
from opensora.acceleration.checkpoint import auto_grad_checkpoint
from ..utils import init_modules
from .nn.act import build_act
from .nn.norm import build_norm
from .nn.ops import (
ChannelDuplicatingPixelShuffleUpSampleLayer,
ConvLayer,
ConvPixelShuffleUpSampleLayer,
ConvPixelUnshuffleDownSampleLayer,
EfficientViTBlock,
IdentityLayer,
InterpolateConvUpSampleLayer,
OpSequential,
PixelUnshuffleChannelAveragingDownSampleLayer,
ResBlock,
ResidualBlock,
)
__all__ = ["DCAE", "dc_ae_f32"]
@dataclass
class EncoderConfig:
    """Configuration for the DC-AE encoder tower."""

    # MISSING: must be supplied (interpolated from DCAEConfig via OmegaConf)
    in_channels: int = MISSING
    latent_channels: int = MISSING
    # per-stage channel widths and block counts (parallel lists)
    width_list: tuple[int, ...] = (128, 256, 512, 512, 1024, 1024)
    depth_list: tuple[int, ...] = (2, 2, 2, 2, 2, 2)
    # a single block type for all stages, or one entry per stage
    block_type: Any = "ResBlock"
    norm: str = "rms2d"
    act: str = "silu"
    downsample_block_type: str = "ConvPixelUnshuffle"
    # when True, downsample blocks also switch to the next stage's width
    downsample_match_channel: bool = True
    downsample_shortcut: Optional[str] = "averaging"
    # optional norm/act/shortcut applied by the final projection
    out_norm: Optional[str] = None
    out_act: Optional[str] = None
    out_shortcut: Optional[str] = "averaging"
    # project out 2x latent_channels when True
    double_latent: bool = False
    is_video: bool = False
    # per-stage flags; empty tuple means no temporal downsampling anywhere
    temporal_downsample: tuple[bool, ...] = ()
@dataclass
class DecoderConfig:
    """Configuration for the DC-AE decoder tower (mirrors EncoderConfig)."""

    # MISSING: must be supplied (interpolated from DCAEConfig via OmegaConf)
    in_channels: int = MISSING
    latent_channels: int = MISSING
    # shortcut used when projecting latents into the deepest stage
    in_shortcut: Optional[str] = "duplicating"
    # per-stage channel widths and block counts (parallel lists)
    width_list: tuple[int, ...] = (128, 256, 512, 512, 1024, 1024)
    depth_list: tuple[int, ...] = (2, 2, 2, 2, 2, 2)
    # a single block type for all stages, or one entry per stage
    block_type: Any = "ResBlock"
    norm: Any = "rms2d"
    act: Any = "silu"
    upsample_block_type: str = "ConvPixelShuffle"
    # when True, upsample blocks also switch to the next stage's width
    upsample_match_channel: bool = True
    upsample_shortcut: str = "duplicating"
    # norm/act applied by the final projection back to pixel space
    out_norm: str = "rms2d"
    out_act: str = "relu"
    is_video: bool = False
    # per-stage flags; empty tuple means no temporal upsampling anywhere
    temporal_upsample: tuple[bool, ...] = ()
@dataclass
class DCAEConfig:
    """Top-level DC-AE autoencoder configuration."""

    in_channels: int = 3
    latent_channels: int = 32
    # overall compression ratios of the autoencoder
    time_compression_ratio: int = 1
    spatial_compression_ratio: int = 32
    # "${..in_channels}" etc. are OmegaConf interpolations resolved at build time
    encoder: EncoderConfig = field(
        default_factory=lambda: EncoderConfig(in_channels="${..in_channels}", latent_channels="${..latent_channels}")
    )
    decoder: DecoderConfig = field(
        default_factory=lambda: DecoderConfig(in_channels="${..in_channels}", latent_channels="${..latent_channels}")
    )
    use_quant_conv: bool = False
    # optional checkpoint to load, and which format it comes from
    pretrained_path: Optional[str] = None
    pretrained_source: str = "dc-ae"
    scaling_factor: Optional[float] = None
    is_image_model: bool = False
    is_training: bool = False  # NOTE: set to True in vae train config
    # tiled encode/decode settings for large inputs
    use_spatial_tiling: bool = False
    use_temporal_tiling: bool = False
    spatial_tile_size: int = 256
    temporal_tile_size: int = 32
    tile_overlap_factor: float = 0.25
def build_block(
    block_type: str, in_channels: int, out_channels: int, norm: Optional[str], act: Optional[str], is_video: bool
) -> nn.Module:
    """Construct a single backbone block of the requested type."""
    if block_type == "ResBlock":
        assert in_channels == out_channels
        core = ResBlock(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            use_bias=(True, False),
            norm=(None, norm),
            act_func=(act, None),
            is_video=is_video,
        )
        return ResidualBlock(core, IdentityLayer())
    if block_type in ("EViT_GLU", "EViTS5_GLU"):
        assert in_channels == out_channels
        # the two EViT variants differ only in the attention scales tuple
        scales = () if block_type == "EViT_GLU" else (5,)
        return EfficientViTBlock(
            in_channels, norm=norm, act_func=act, local_module="GLUMBConv", scales=scales, is_video=is_video
        )
    raise ValueError(f"block_type {block_type} is not supported")
def build_stage_main(
    width: int, depth: int, block_type: str | list[str], norm: str, act: str, input_width: int, is_video: bool
) -> list[nn.Module]:
    """Build ``depth`` blocks for one stage; only the first maps input_width -> width."""
    assert isinstance(block_type, str) or (isinstance(block_type, list) and depth == len(block_type))
    return [
        build_block(
            # a list gives one block type per depth index; a string applies to all
            block_type=block_type[d] if isinstance(block_type, list) else block_type,
            in_channels=input_width if d == 0 else width,
            out_channels=width,
            norm=norm,
            act=act,
            is_video=is_video,
        )
        for d in range(depth)
    ]
def build_downsample_block(
    block_type: str,
    in_channels: int,
    out_channels: int,
    shortcut: Optional[str],
    is_video: bool,
    temporal_downsample: bool = False,
) -> nn.Module:
    """
    Spatial downsample is always performed. Temporal downsample is optional.
    """
    if block_type == "Conv":
        # videos use a (T, H, W) stride; images a scalar one
        if not is_video:
            stride = 2
        elif temporal_downsample:
            stride = (2, 2, 2)
        else:
            stride = (1, 2, 2)
        block = ConvLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            use_bias=True,
            norm=None,
            act_func=None,
            is_video=is_video,
        )
    elif block_type == "ConvPixelUnshuffle":
        if is_video:
            raise NotImplementedError("ConvPixelUnshuffle downsample is not supported for video")
        block = ConvPixelUnshuffleDownSampleLayer(
            in_channels=in_channels, out_channels=out_channels, kernel_size=3, factor=2
        )
    else:
        raise ValueError(f"block_type {block_type} is not supported for downsampling")

    if shortcut == "averaging":
        # add a residual skip path alongside the learned downsample
        skip = PixelUnshuffleChannelAveragingDownSampleLayer(
            in_channels=in_channels, out_channels=out_channels, factor=2, temporal_downsample=temporal_downsample
        )
        block = ResidualBlock(block, skip)
    elif shortcut is not None:
        raise ValueError(f"shortcut {shortcut} is not supported for downsample")
    return block
def build_upsample_block(
    block_type: str,
    in_channels: int,
    out_channels: int,
    shortcut: Optional[str],
    is_video: bool,
    temporal_upsample: bool = False,
) -> nn.Module:
    """Build a 2x upsample block, optionally wrapped with a residual shortcut."""
    if block_type == "ConvPixelShuffle":
        if is_video:
            raise NotImplementedError("ConvPixelShuffle upsample is not supported for video")
        main_block = ConvPixelShuffleUpSampleLayer(
            in_channels=in_channels, out_channels=out_channels, kernel_size=3, factor=2
        )
    elif block_type == "InterpolateConv":
        main_block = InterpolateConvUpSampleLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            factor=2,
            is_video=is_video,
            temporal_upsample=temporal_upsample,
        )
    else:
        raise ValueError(f"block_type {block_type} is not supported for upsampling")

    if shortcut == "duplicating":
        # add a residual skip path alongside the learned upsample
        skip = ChannelDuplicatingPixelShuffleUpSampleLayer(
            in_channels=in_channels, out_channels=out_channels, factor=2, temporal_upsample=temporal_upsample
        )
        return ResidualBlock(main_block, skip)
    if shortcut is not None:
        raise ValueError(f"shortcut {shortcut} is not supported for upsample")
    return main_block
def build_encoder_project_in_block(
    in_channels: int, out_channels: int, factor: int, downsample_block_type: str, is_video: bool
):
    """Project raw input into the first stage width.

    ``factor == 1`` is a plain 3x3 conv; ``factor == 2`` additionally
    downsamples (images only).

    Raises:
        NotImplementedError: factor 2 with video inputs.
        ValueError: any other factor.
    """
    if factor == 1:
        block = ConvLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            use_bias=True,
            norm=None,
            act_func=None,
            is_video=is_video,
        )
    elif factor == 2:
        if is_video:
            raise NotImplementedError("Downsample during project_in is not supported for video")
        block = build_downsample_block(
            block_type=downsample_block_type,
            in_channels=in_channels,
            out_channels=out_channels,
            shortcut=None,
            # bug fix: is_video is a required parameter of build_downsample_block;
            # the original call omitted it and raised TypeError on this path.
            is_video=is_video,
        )
    else:
        raise ValueError(f"downsample factor {factor} is not supported for encoder project in")
    return block
def build_encoder_project_out_block(
    in_channels: int,
    out_channels: int,
    norm: Optional[str],
    act: Optional[str],
    shortcut: Optional[str],
    is_video: bool,
):
    """Final encoder projection: optional norm/act, then a 3x3 conv to the latent width."""
    # NOTE(review): build_norm is called without a channel count here, unlike
    # the decoder project-out — confirm build_norm tolerates that.
    main_ops = OpSequential(
        [
            build_norm(norm),
            build_act(act),
            ConvLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                use_bias=True,
                norm=None,
                act_func=None,
                is_video=is_video,
            ),
        ]
    )
    if shortcut == "averaging":
        # factor-1 channel averaging provides a residual path into the latent space
        skip = PixelUnshuffleChannelAveragingDownSampleLayer(
            in_channels=in_channels, out_channels=out_channels, factor=1
        )
        return ResidualBlock(main_ops, skip)
    if shortcut is not None:
        raise ValueError(f"shortcut {shortcut} is not supported for encoder project out")
    return main_ops
def build_decoder_project_in_block(in_channels: int, out_channels: int, shortcut: Optional[str], is_video: bool):
    """Project latents into the deepest decoder stage width with a 3x3 conv."""
    conv = ConvLayer(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=1,
        use_bias=True,
        norm=None,
        act_func=None,
        is_video=is_video,
    )
    if shortcut == "duplicating":
        # factor-1 channel duplication provides a residual path from the latents
        skip = ChannelDuplicatingPixelShuffleUpSampleLayer(
            in_channels=in_channels, out_channels=out_channels, factor=1
        )
        return ResidualBlock(conv, skip)
    if shortcut is not None:
        raise ValueError(f"shortcut {shortcut} is not supported for decoder project in")
    return conv
def build_decoder_project_out_block(
    in_channels: int,
    out_channels: int,
    factor: int,
    upsample_block_type: str,
    norm: Optional[str],
    act: Optional[str],
    is_video: bool,
):
    """Final decoder projection: norm + act, then a conv (factor 1) or a 2x upsample (factor 2).

    Raises:
        NotImplementedError: factor 2 with video inputs.
        ValueError: any other factor.
    """
    layers: list[nn.Module] = [
        build_norm(norm, in_channels),
        build_act(act),
    ]
    if factor == 1:
        layers.append(
            ConvLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                use_bias=True,
                norm=None,
                act_func=None,
                is_video=is_video,
            )
        )
    elif factor == 2:
        if is_video:
            raise NotImplementedError("Upsample during project_out is not supported for video")
        layers.append(
            build_upsample_block(
                block_type=upsample_block_type,
                in_channels=in_channels,
                out_channels=out_channels,
                shortcut=None,
                # bug fix: is_video is a required parameter of build_upsample_block;
                # the original call omitted it and raised TypeError on this path.
                is_video=is_video,
            )
        )
    else:
        raise ValueError(f"upsample factor {factor} is not supported for decoder project out")
    return OpSequential(layers)
class Encoder(nn.Module):
    """DC-AE encoder: input projection, a pyramid of stages with optional
    downsampling between them, and a projection to latent channels."""

    def __init__(self, cfg: EncoderConfig):
        super().__init__()
        self.cfg = cfg
        num_stages = len(cfg.width_list)
        self.num_stages = num_stages
        # The per-stage config lists must all agree on the stage count.
        assert len(cfg.depth_list) == num_stages
        assert len(cfg.width_list) == num_stages
        assert isinstance(cfg.block_type, str) or (
            isinstance(cfg.block_type, list) and len(cfg.block_type) == num_stages
        )
        # An empty first stage (depth 0) means the input is projected straight
        # into the second stage's width with a 2x downsample.
        first_stage_active = cfg.depth_list[0] > 0
        self.project_in = build_encoder_project_in_block(
            in_channels=cfg.in_channels,
            out_channels=cfg.width_list[0] if first_stage_active else cfg.width_list[1],
            factor=1 if first_stage_active else 2,
            downsample_block_type=cfg.downsample_block_type,
            is_video=cfg.is_video,
        )
        stages: list[OpSequential] = []
        for stage_id, (width, depth) in enumerate(zip(cfg.width_list, cfg.depth_list)):
            block_type = cfg.block_type[stage_id] if isinstance(cfg.block_type, list) else cfg.block_type
            stage = build_stage_main(
                width=width,
                depth=depth,
                block_type=block_type,
                norm=cfg.norm,
                act=cfg.act,
                input_width=width,
                is_video=cfg.is_video,
            )
            # Every non-empty stage except the last is followed by a downsample.
            if stage_id < num_stages - 1 and depth > 0:
                stage.append(
                    build_downsample_block(
                        block_type=cfg.downsample_block_type,
                        in_channels=width,
                        out_channels=cfg.width_list[stage_id + 1] if cfg.downsample_match_channel else width,
                        shortcut=cfg.downsample_shortcut,
                        is_video=cfg.is_video,
                        temporal_downsample=cfg.temporal_downsample[stage_id] if cfg.temporal_downsample != [] else False,
                    )
                )
            stages.append(OpSequential(stage))
        self.stages = nn.ModuleList(stages)
        self.project_out = build_encoder_project_out_block(
            in_channels=cfg.width_list[-1],
            out_channels=2 * cfg.latent_channels if cfg.double_latent else cfg.latent_channels,
            norm=cfg.out_norm,
            act=cfg.out_act,
            shortcut=cfg.out_shortcut,
            is_video=cfg.is_video,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.project_in(x)
        for stage in self.stages:
            # Depth-0 stages are empty containers; skip them entirely.
            if len(stage.op_list) == 0:
                continue
            x = auto_grad_checkpoint(stage, x)
        x = auto_grad_checkpoint(self.project_out, x)
        return x
class Decoder(nn.Module):
    """DC-AE decoder: latent projection, stages built deepest-first with an
    upsample block in front of each shallower stage, then the output head."""

    def __init__(self, cfg: DecoderConfig):
        super().__init__()
        self.cfg = cfg
        num_stages = len(cfg.width_list)
        self.num_stages = num_stages
        # The per-stage config lists must all agree on the stage count.
        assert len(cfg.depth_list) == num_stages
        assert len(cfg.width_list) == num_stages
        assert isinstance(cfg.block_type, str) or (
            isinstance(cfg.block_type, list) and len(cfg.block_type) == num_stages
        )
        assert isinstance(cfg.norm, str) or (isinstance(cfg.norm, list) and len(cfg.norm) == num_stages)
        assert isinstance(cfg.act, str) or (isinstance(cfg.act, list) and len(cfg.act) == num_stages)
        # BUGFIX: forward() reads this flag, but nothing ever initialized it,
        # so the first forward call raised AttributeError. Default to False
        # (checkpoint project_out like everything else); a config field or a
        # later external assignment can still override it.
        self.disc_off_grad_ckpt = getattr(cfg, "disc_off_grad_ckpt", False)
        self.project_in = build_decoder_project_in_block(
            in_channels=cfg.latent_channels,
            out_channels=cfg.width_list[-1],
            shortcut=cfg.in_shortcut,
            is_video=cfg.is_video,
        )
        self.stages: list[OpSequential] = []
        # Build from the deepest stage down to the shallowest, inserting at the
        # front so self.stages ends up ordered shallow -> deep.
        for stage_id, (width, depth) in reversed(list(enumerate(zip(cfg.width_list, cfg.depth_list)))):
            stage = []
            if stage_id < num_stages - 1 and depth > 0:
                # Upsample from the deeper stage's width into this stage.
                upsample_block = build_upsample_block(
                    block_type=cfg.upsample_block_type,
                    in_channels=cfg.width_list[stage_id + 1],
                    out_channels=width if cfg.upsample_match_channel else cfg.width_list[stage_id + 1],
                    shortcut=cfg.upsample_shortcut,
                    is_video=cfg.is_video,
                    temporal_upsample=cfg.temporal_upsample[stage_id] if cfg.temporal_upsample != [] else False,
                )
                stage.append(upsample_block)
            block_type = cfg.block_type[stage_id] if isinstance(cfg.block_type, list) else cfg.block_type
            norm = cfg.norm[stage_id] if isinstance(cfg.norm, list) else cfg.norm
            act = cfg.act[stage_id] if isinstance(cfg.act, list) else cfg.act
            stage.extend(
                build_stage_main(
                    width=width,
                    depth=depth,
                    block_type=block_type,
                    norm=norm,
                    act=act,
                    # Without upsample_match_channel the stage still receives
                    # the deeper stage's channel count.
                    input_width=(
                        width if cfg.upsample_match_channel else cfg.width_list[min(stage_id + 1, num_stages - 1)]
                    ),
                    is_video=cfg.is_video,
                )
            )
            self.stages.insert(0, OpSequential(stage))
        self.stages = nn.ModuleList(self.stages)
        self.project_out = build_decoder_project_out_block(
            in_channels=cfg.width_list[0] if cfg.depth_list[0] > 0 else cfg.width_list[1],
            out_channels=cfg.in_channels,
            factor=1 if cfg.depth_list[0] > 0 else 2,
            upsample_block_type=cfg.upsample_block_type,
            norm=cfg.out_norm,
            act=cfg.out_act,
            is_video=cfg.is_video,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = auto_grad_checkpoint(self.project_in, x)
        # self.stages is ordered shallow -> deep; decode deep -> shallow.
        for stage in reversed(self.stages):
            if len(stage.op_list) == 0:  # depth-0 stages are empty containers
                continue
            x = auto_grad_checkpoint(stage, x)
        if self.disc_off_grad_ckpt:
            # Keep project_out out of gradient checkpointing when requested.
            x = self.project_out(x)
        else:
            x = auto_grad_checkpoint(self.project_out, x)
        return x
class DCAE(nn.Module):
    """Deep-Compression Autoencoder with optional tiled inference.

    Tiling splits large inputs into overlapping spatial and/or temporal tiles,
    encodes/decodes each tile independently, and linearly blends the overlaps
    to hide seams. Latents are divided by ``scaling_factor`` on encode and
    multiplied back on decode.
    """

    def __init__(self, cfg: DCAEConfig):
        super().__init__()
        self.cfg = cfg
        self.encoder = Encoder(cfg.encoder)
        self.decoder = Decoder(cfg.decoder)
        self.scaling_factor = cfg.scaling_factor
        self.time_compression_ratio = cfg.time_compression_ratio
        self.spatial_compression_ratio = cfg.spatial_compression_ratio
        self.use_spatial_tiling = cfg.use_spatial_tiling
        self.use_temporal_tiling = cfg.use_temporal_tiling
        self.spatial_tile_size = cfg.spatial_tile_size
        self.temporal_tile_size = cfg.temporal_tile_size
        # BUGFIX: these asserts used `//`, which only fails when the quotient
        # is zero, so a non-divisible tile size slipped through despite the
        # message. `% == 0` enforces the divisibility the message promises.
        assert (
            cfg.spatial_tile_size % cfg.spatial_compression_ratio == 0
        ), f"spatial tile size {cfg.spatial_tile_size} must be divisible by spatial compression of {cfg.spatial_compression_ratio}"
        self.spatial_tile_latent_size = cfg.spatial_tile_size // cfg.spatial_compression_ratio
        assert (
            cfg.temporal_tile_size % cfg.time_compression_ratio == 0
        ), f"temporal tile size {cfg.temporal_tile_size} must be divisible by temporal compression of {cfg.time_compression_ratio}"
        self.temporal_tile_latent_size = cfg.temporal_tile_size // cfg.time_compression_ratio
        self.tile_overlap_factor = cfg.tile_overlap_factor
        if self.cfg.pretrained_path is not None:
            self.load_model()
        self.to(torch.float32)
        # NOTE(review): init_modules runs even after load_model(); confirm it
        # does not re-initialize (clobber) freshly loaded pretrained weights.
        init_modules(self, init_type="trunc_normal")

    def load_model(self):
        """Load pretrained weights; only the "dc-ae" checkpoint layout is supported."""
        if self.cfg.pretrained_source == "dc-ae":
            state_dict = torch.load(self.cfg.pretrained_path, map_location="cpu", weights_only=True)["state_dict"]
            self.load_state_dict(state_dict)
        else:
            raise NotImplementedError

    def get_last_layer(self):
        """Weight of the decoder's final conv (op_list: norm, act, conv)."""
        return self.decoder.project_out.op_list[2].conv.weight

    def encode_single(self, x: torch.Tensor, is_video_encoder: bool = False) -> torch.Tensor:
        """Encode one sample (batch size 1).

        A 5-D input fed to a 2-D encoder is folded frame-by-frame into the
        batch dimension and restored to [1, C, T, H, W] afterwards.
        """
        assert x.shape[0] == 1
        is_video = x.dim() == 5
        if is_video and not is_video_encoder:
            b, c, f, h, w = x.shape
            x = x.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
        z = self.encoder(x)
        if is_video and not is_video_encoder:
            z = z.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
        if self.scaling_factor is not None:
            z = z / self.scaling_factor
        return z

    def _encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a batch; outside training, samples go through one at a time."""
        if self.cfg.is_training:
            return self.encoder(x)
        is_video_encoder = self.encoder.cfg.is_video if self.encoder.cfg.is_video is not None else False
        x_ret = []
        for i in range(x.shape[0]):
            x_ret.append(self.encode_single(x[i : i + 1], is_video_encoder))
        return torch.cat(x_ret, dim=0)

    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        """Linearly blend the bottom rows of `a` into the top rows of `b` (modifies b)."""
        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
        for y in range(blend_extent):
            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
                y / blend_extent
            )
        return b

    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        """Linearly blend the right columns of `a` into the left columns of `b` (modifies b)."""
        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
                x / blend_extent
            )
        return b

    def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        """Linearly blend the last frames of `a` into the first frames of `b` (modifies b)."""
        blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
        for x in range(blend_extent):
            b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
                x / blend_extent
            )
        return b

    def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode with overlapping spatial tiles and blend the latent seams."""
        net_size = int(self.spatial_tile_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.spatial_tile_latent_size * self.tile_overlap_factor)
        row_limit = self.spatial_tile_latent_size - blend_extent
        # Split video into tiles and encode them separately.
        rows = []
        for i in range(0, x.shape[-2], net_size):
            row = []
            for j in range(0, x.shape[-1], net_size):
                tile = x[:, :, :, i : i + self.spatial_tile_size, j : j + self.spatial_tile_size]
                tile = self._encode(tile)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=-1))
        return torch.cat(result_rows, dim=-2)

    def temporal_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode with overlapping temporal tiles (spatially tiled per tile if needed)."""
        overlap_size = int(self.temporal_tile_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.temporal_tile_latent_size * self.tile_overlap_factor)
        t_limit = self.temporal_tile_latent_size - blend_extent
        # Split the video into tiles and encode them separately.
        row = []
        for i in range(0, x.shape[2], overlap_size):
            tile = x[:, :, i : i + self.temporal_tile_size, :, :]
            if self.use_spatial_tiling and (
                tile.shape[-1] > self.spatial_tile_size or tile.shape[-2] > self.spatial_tile_size
            ):
                tile = self.spatial_tiled_encode(tile)
            else:
                tile = self._encode(tile)
            row.append(tile)
        result_row = []
        for i, tile in enumerate(row):
            if i > 0:
                tile = self.blend_t(row[i - 1], tile, blend_extent)
            result_row.append(tile[:, :, :t_limit, :, :])
        return torch.cat(result_row, dim=2)

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode, dispatching to temporal/spatial tiling for oversized inputs."""
        if self.use_temporal_tiling and x.shape[2] > self.temporal_tile_size:
            return self.temporal_tiled_encode(x)
        elif self.use_spatial_tiling and (x.shape[-1] > self.spatial_tile_size or x.shape[-2] > self.spatial_tile_size):
            return self.spatial_tiled_encode(x)
        else:
            return self._encode(x)

    def spatial_tiled_decode(self, z: torch.FloatTensor) -> torch.Tensor:
        """Decode with overlapping latent tiles and blend the pixel seams."""
        net_size = int(self.spatial_tile_latent_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.spatial_tile_size * self.tile_overlap_factor)
        row_limit = self.spatial_tile_size - blend_extent
        # Split z into overlapping tiles and decode them separately.
        # The tiles have an overlap to avoid seams between tiles.
        rows = []
        for i in range(0, z.shape[-2], net_size):
            row = []
            for j in range(0, z.shape[-1], net_size):
                tile = z[:, :, :, i : i + self.spatial_tile_latent_size, j : j + self.spatial_tile_latent_size]
                decoded = self._decode(tile)
                row.append(decoded)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=-1))
        return torch.cat(result_rows, dim=-2)

    def temporal_tiled_decode(self, z: torch.Tensor) -> torch.Tensor:
        """Decode with overlapping temporal latent tiles and blend frame seams."""
        overlap_size = int(self.temporal_tile_latent_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.temporal_tile_size * self.tile_overlap_factor)
        t_limit = self.temporal_tile_size - blend_extent
        row = []
        for i in range(0, z.shape[2], overlap_size):
            tile = z[:, :, i : i + self.temporal_tile_latent_size, :, :]
            if self.use_spatial_tiling and (
                tile.shape[-1] > self.spatial_tile_latent_size or tile.shape[-2] > self.spatial_tile_latent_size
            ):
                decoded = self.spatial_tiled_decode(tile)
            else:
                decoded = self._decode(tile)
            row.append(decoded)
        result_row = []
        for i, tile in enumerate(row):
            if i > 0:
                tile = self.blend_t(row[i - 1], tile, blend_extent)
            result_row.append(tile[:, :, :t_limit, :, :])
        return torch.cat(result_row, dim=2)

    def decode_single(self, z: torch.Tensor, is_video_decoder: bool = False) -> torch.Tensor:
        """Decode one latent (batch size 1); mirrors encode_single's frame folding."""
        assert z.shape[0] == 1
        is_video = z.dim() == 5
        if is_video and not is_video_decoder:
            b, c, f, h, w = z.shape
            z = z.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
        if self.scaling_factor is not None:
            z = z * self.scaling_factor
        x = self.decoder(z)
        if is_video and not is_video_decoder:
            x = x.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
        return x

    def _decode(self, z: torch.Tensor) -> torch.Tensor:
        """Decode a batch; outside training, latents go through one at a time."""
        if self.cfg.is_training:
            return self.decoder(z)
        is_video_decoder = self.decoder.cfg.is_video if self.decoder.cfg.is_video is not None else False
        x_ret = []
        for i in range(z.shape[0]):
            x_ret.append(self.decode_single(z[i : i + 1], is_video_decoder))
        return torch.cat(x_ret, dim=0)

    def decode(self, z: torch.Tensor) -> torch.Tensor:
        """Decode, dispatching to temporal/spatial tiling for oversized latents."""
        if self.use_temporal_tiling and z.shape[2] > self.temporal_tile_latent_size:
            return self.temporal_tiled_decode(z)
        elif self.use_spatial_tiling and (
            z.shape[-1] > self.spatial_tile_latent_size or z.shape[-2] > self.spatial_tile_latent_size
        ):
            return self.spatial_tiled_decode(z)
        else:
            return self._decode(z)

    # BUGFIX (annotation only): the old hint promised (Any, Tensor, dict) but
    # the method returns (reconstruction, None, latent).
    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, None, torch.Tensor]:
        """Full autoencode pass; returns (reconstruction, None, latent)."""
        x_type = x.dtype
        is_image_model = self.cfg.__dict__.get("is_image_model", False)
        # Run in the encoder's parameter dtype, restore the input dtype at the end.
        x = x.to(self.encoder.project_in.conv.weight.dtype)
        if is_image_model:
            # Fold single frames into the batch dim for an image-only model.
            b, c, _, h, w = x.shape
            x = x.permute(0, 2, 1, 3, 4).reshape(-1, c, h, w)
        z = self.encode(x)
        dec = self.decode(z)
        if is_image_model:
            dec = dec.reshape(b, 1, c, h, w).permute(0, 2, 1, 3, 4)
            z = z.unsqueeze(dim=0).permute(0, 2, 1, 3, 4)
        dec = dec.to(x_type)
        return dec, None, z

    def get_latent_size(self, input_size: list[int]) -> list[int]:
        """Map a [T, H, W] input size to the latent size (ceiling division)."""
        latent_size = []
        # T
        latent_size.append((input_size[0] - 1) // self.time_compression_ratio + 1)
        # H, w
        for i in range(1, 3):
            latent_size.append((input_size[i] - 1) // self.spatial_compression_ratio + 1)
        return latent_size
def dc_ae_f32(name: str, pretrained_path: str) -> DCAEConfig:
    """Build the DCAEConfig for a named f32 (32x spatial) autoencoder variant.

    The variant is described as an OmegaConf dot-list string and merged onto
    the structured DCAEConfig defaults.
    """
    if name not in ["dc-ae-f32t4c128"]:
        raise NotImplementedError
    dot_list_str = (
        "time_compression_ratio=4 "
        "spatial_compression_ratio=32 "
        "encoder.block_type=[ResBlock,ResBlock,ResBlock,EViTS5_GLU,EViTS5_GLU,EViTS5_GLU] "
        "encoder.width_list=[128,256,512,512,1024,1024] encoder.depth_list=[2,2,2,3,3,3] "
        "encoder.downsample_block_type=Conv "
        "encoder.norm=rms3d "
        "encoder.is_video=True "
        "decoder.block_type=[ResBlock,ResBlock,ResBlock,EViTS5_GLU,EViTS5_GLU,EViTS5_GLU] "
        "decoder.width_list=[128,256,512,512,1024,1024] decoder.depth_list=[3,3,3,3,3,3] "
        "decoder.upsample_block_type=InterpolateConv "
        "decoder.norm=rms3d decoder.act=silu decoder.out_norm=rms3d "
        "decoder.is_video=True "
        "encoder.temporal_downsample=[False,False,False,True,True,False] "
        "decoder.temporal_upsample=[False,False,False,True,True,False] "
        "latent_channels=128"
    )  # make sure there is no trailing blankspace in the last line
    overrides = OmegaConf.from_dotlist(dot_list_str.split(" "))
    cfg: DCAEConfig = OmegaConf.to_object(OmegaConf.merge(OmegaConf.structured(DCAEConfig), overrides))
    cfg.pretrained_path = pretrained_path
    return cfg

View File

@ -0,0 +1,3 @@
from .act import *
from .norm import *
from .ops import *

View File

@ -0,0 +1,44 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from functools import partial
from typing import Optional
import torch.nn as nn
from ..nn.vo_ops import build_kwargs_from_config
# Only the factory function is part of this module's public API.
__all__ = ["build_act"]
# register activation function here
# Maps config-string names to activation constructors; "gelu" uses the tanh
# approximation. Add entries here to make them available to build_act().
REGISTERED_ACT_DICT: dict[str, type] = {
    "relu": nn.ReLU,
    "relu6": nn.ReLU6,
    "hswish": nn.Hardswish,
    "silu": nn.SiLU,
    "gelu": partial(nn.GELU, approximate="tanh"),
}
def build_act(name: str, **kwargs) -> Optional[nn.Module]:
    """Instantiate the activation registered under `name`.

    Unregistered (or None) names yield None, so callers can treat the
    activation as optional. Only kwargs the target class accepts are
    forwarded (filtered by build_kwargs_from_config).
    """
    act_cls = REGISTERED_ACT_DICT.get(name)
    if act_cls is None:
        return None
    return act_cls(**build_kwargs_from_config(kwargs, act_cls))

View File

@ -0,0 +1,98 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from typing import Optional
import torch
import torch.nn as nn
from torch.nn.modules.batchnorm import _BatchNorm
from ..nn.vo_ops import build_kwargs_from_config
# Public API: the 2-D LayerNorm variant plus the norm factory and eps helper.
__all__ = ["LayerNorm2d", "build_norm", "set_norm_eps"]
class LayerNorm2d(nn.LayerNorm):
    """LayerNorm over the channel dim (dim=1) of NCHW tensors.

    nn.LayerNorm normalizes trailing dims; this variant normalizes each
    spatial position across channels instead, broadcasting the affine
    parameters over H and W.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        centered = x - x.mean(dim=1, keepdim=True)
        normed = centered / torch.sqrt(centered.square().mean(dim=1, keepdim=True) + self.eps)
        if self.elementwise_affine:
            normed = normed * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1)
        return normed
class RMSNorm2d(nn.Module):
    """RMS normalization over the channel dim (dim=1) of NCHW tensors.

    The statistic is computed in float32 for stability and the result is
    cast back to the input dtype. Affine parameters are allocated with
    torch.empty and expected to be initialized externally.
    """

    def __init__(
        self, num_features: int, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True
    ) -> None:
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not elementwise_affine:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
            return
        self.weight = torch.nn.parameter.Parameter(torch.empty(self.num_features))
        if bias:
            self.bias = torch.nn.parameter.Parameter(torch.empty(self.num_features))
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(torch.square(x.float()).mean(dim=1, keepdim=True) + self.eps)
        x = (x / rms).to(x.dtype)
        if self.elementwise_affine:
            x = x * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1)
        return x
class RMSNorm3d(RMSNorm2d):
    """RMSNorm2d extended to NCTHW tensors.

    The channel-wise statistic is unchanged; the affine parameters are
    broadcast over T, H and W instead of just H and W.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(torch.square(x.float()).mean(dim=1, keepdim=True) + self.eps)
        x = (x / rms).to(x.dtype)
        if self.elementwise_affine:
            x = x * self.weight.view(1, -1, 1, 1, 1) + self.bias.view(1, -1, 1, 1, 1)
        return x
# register normalization function here
# Maps config-string names to norm constructors consumed by build_norm();
# "ln"/"ln2d" expect normalized_shape, the others expect num_features.
REGISTERED_NORM_DICT: dict[str, type] = {
    "bn2d": nn.BatchNorm2d,
    "ln": nn.LayerNorm,
    "ln2d": LayerNorm2d,
    "rms2d": RMSNorm2d,
    "rms3d": RMSNorm3d,
}
def build_norm(name="bn2d", num_features=None, **kwargs) -> Optional[nn.Module]:
    """Instantiate the normalization registered under `name`.

    `num_features` is forwarded under the keyword each class expects
    (normalized_shape for the LayerNorm variants). Unregistered names
    return None so callers can treat the norm as optional.
    """
    feature_key = "normalized_shape" if name in ["ln", "ln2d"] else "num_features"
    kwargs[feature_key] = num_features
    if name not in REGISTERED_NORM_DICT:
        return None
    norm_cls = REGISTERED_NORM_DICT[name]
    return norm_cls(**build_kwargs_from_config(kwargs, norm_cls))
def set_norm_eps(model: nn.Module, eps: Optional[float] = None) -> None:
    """Set `eps` on every GroupNorm/LayerNorm/BatchNorm submodule of `model`.

    Passing eps=None (the default) leaves every module untouched.
    """
    if eps is None:
        return
    for module in model.modules():
        if isinstance(module, (nn.GroupNorm, nn.LayerNorm, _BatchNorm)):
            module.eps = eps

View File

@ -0,0 +1,978 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0 # upsample on the temporal dimension as well
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from opensora.models.vae.utils import ChannelChunkConv3d
from ...models.nn.act import build_act
from ...models.nn.norm import build_norm
from ...models.nn.vo_ops import chunked_interpolate, get_same_padding, pixel_shuffle_3d, pixel_unshuffle_3d, resize
from ...utils import list_sum, val2list, val2tuple
# Public layers, blocks, and sequential containers exported by this module.
__all__ = [
    "ConvLayer",
    "UpSampleLayer",
    "ConvPixelUnshuffleDownSampleLayer",
    "PixelUnshuffleChannelAveragingDownSampleLayer",
    "ConvPixelShuffleUpSampleLayer",
    "ChannelDuplicatingPixelShuffleUpSampleLayer",
    "LinearLayer",
    "IdentityLayer",
    "DSConv",
    "MBConv",
    "FusedMBConv",
    "ResBlock",
    "LiteMLA",
    "EfficientViTBlock",
    "ResidualBlock",
    "DAGBlock",
    "OpSequential",
]
#################################################################################
# Basic Layers #
#################################################################################
class ConvLayer(nn.Module):
    """Conv + optional norm + optional activation for 2-D (NCHW) or 3-D
    (NCTHW) inputs.

    In video mode the conv is created without padding and same-padding is
    applied explicitly with F.pad in forward(), so the pad mode is
    configurable (pad_mode_3d, following CausalConv3d by Hunyuan but made
    non-causal: half-kernel on both sides of W, H and T).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        dilation=1,
        groups=1,
        use_bias=False,
        dropout=0,
        norm="bn2d",
        act_func="relu",
        is_video=False,
        pad_mode_3d="constant",
    ):
        super().__init__()
        self.is_video = is_video
        if is_video:
            assert dilation == 1, "only support dilation=1 for 3d conv"
            assert kernel_size % 2 == 1, "only support odd kernel size for 3d conv"
            self.pad_mode_3d = pad_mode_3d
            # The causal variant would instead pad (kernel_size - 1, 0) on T.
            half = kernel_size // 2
            self.padding = (half, half, half, half, half, half)  # (W, W, H, H, T, T)
            self.dropout = nn.Dropout3d(dropout, inplace=False) if dropout > 0 else None
            assert isinstance(stride, (int, tuple)), "stride must be an integer or 3-tuple for 3d conv"
            stride_3d = (stride, stride, stride) if isinstance(stride, int) else stride
            # padding is handled by F.pad() in forward()
            self.conv = ChannelChunkConv3d(
                in_channels,
                out_channels,
                kernel_size=(kernel_size, kernel_size, kernel_size),
                stride=stride_3d,
                groups=groups,
                bias=use_bias,
            )
        else:
            pad_2d = get_same_padding(kernel_size) * dilation
            self.dropout = nn.Dropout2d(dropout, inplace=False) if dropout > 0 else None
            self.conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=(kernel_size, kernel_size),
                stride=(stride, stride),
                padding=pad_2d,
                dilation=(dilation, dilation),
                groups=groups,
                bias=use_bias,
            )
        self.norm = build_norm(norm, num_features=out_channels)
        self.act = build_act(act_func)
        self.pad = F.pad

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.dropout is not None:
            x = self.dropout(x)
        if self.is_video:
            # Explicit same-padding; "constant" mode zero-fills.
            x = self.pad(x, self.padding, mode=self.pad_mode_3d)
        x = self.conv(x)
        if self.norm:
            x = self.norm(x)
        if self.act:
            x = self.act(x)
        return x
class UpSampleLayer(nn.Module):
    """Interpolation-based upsampling by a fixed factor or to a target size.

    Runs with CUDA autocast disabled and promotes half-precision inputs to
    float32 before resizing.
    """

    def __init__(
        self,
        mode="bicubic",
        size: Optional[int | tuple[int, int] | list[int]] = None,
        factor=2,
        align_corners=False,
    ):
        super().__init__()
        self.mode = mode
        # An explicit target size takes precedence over the scale factor.
        self.size = val2list(size, 2) if size is not None else None
        self.factor = None if self.size is not None else factor
        self.align_corners = align_corners

    @torch.autocast(device_type="cuda", enabled=False)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # BUGFIX: self.size is a list (val2list) but was compared against a
        # tuple, which is never equal in Python, so the already-at-target-size
        # early return could never fire. Compare list-to-list instead.
        if (self.size is not None and list(x.shape[-2:]) == self.size) or self.factor == 1:
            return x
        if x.dtype in [torch.float16, torch.bfloat16]:
            x = x.float()
        return resize(x, self.size, self.factor, self.mode, self.align_corners)
class ConvPixelUnshuffleDownSampleLayer(nn.Module):
    """Downsample by `factor`: conv to out_channels/factor^2, then
    pixel-unshuffle (space-to-channel). 2-D inputs only."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        factor: int,
    ):
        super().__init__()
        self.factor = factor
        out_ratio = factor**2
        assert out_channels % out_ratio == 0
        # The conv produces 1/factor^2 of the target channels; the unshuffle
        # in forward() restores the full out_channels.
        self.conv = ConvLayer(
            in_channels=in_channels,
            out_channels=out_channels // out_ratio,
            kernel_size=kernel_size,
            use_bias=True,
            norm=None,
            act_func=None,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.pixel_unshuffle(self.conv(x), self.factor)
class PixelUnshuffleChannelAveragingDownSampleLayer(nn.Module):
    """Parameter-free downsample: pixel-unshuffle by `factor`, then average
    channel groups down to out_channels.

    4-D (NCHW) inputs always use 2-D unshuffle; 5-D (NCTHW) inputs use 3-D
    unshuffle (including time) only when temporal_downsample is set and T > 1,
    otherwise the 2-D unshuffle is applied per frame.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        factor: int,
        temporal_downsample: bool = False,  # temporal downsample for 5d input tensor
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor = factor
        self.temporal_downsample = temporal_downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() == 4:
            assert self.in_channels * self.factor**2 % self.out_channels == 0
            group_size = self.in_channels * self.factor**2 // self.out_channels
            x = F.pixel_unshuffle(x, self.factor)
            B, C, H, W = x.shape
            x = x.view(B, self.out_channels, group_size, H, W)
            x = x.mean(dim=2)
        elif x.dim() == 5:  # [B, C, T, H, W]
            _, _, T, _, _ = x.shape
            if self.temporal_downsample and T != 1:  # 3d pixel unshuffle
                x = pixel_unshuffle_3d(x, self.factor)
                assert self.in_channels * self.factor**3 % self.out_channels == 0
                group_size = self.in_channels * self.factor**3 // self.out_channels
            else:  # 2d pixel unshuffle applied per frame
                x = x.permute(0, 2, 1, 3, 4)  # [B, T, C, H, W]
                x = F.pixel_unshuffle(x, self.factor)
                x = x.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]
                assert self.in_channels * self.factor**2 % self.out_channels == 0
                group_size = self.in_channels * self.factor**2 // self.out_channels
            B, C, T, H, W = x.shape
            x = x.view(B, self.out_channels, group_size, T, H, W)
            x = x.mean(dim=2)
        else:
            raise ValueError(f"Unsupported input dimension: {x.dim()}")
        return x

    def __repr__(self):
        # BUGFIX: the closing parenthesis was misplaced before
        # temporal_downsample, truncating it out of the repr.
        return f"PixelUnshuffleChannelAveragingDownSampleLayer(in_channels={self.in_channels}, out_channels={self.out_channels}, factor={self.factor}, temporal_downsample={self.temporal_downsample})"
class ConvPixelShuffleUpSampleLayer(nn.Module):
    """Upsample by `factor`: conv to out_channels*factor^2, then
    pixel-shuffle (channel-to-space). 2-D inputs only."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        factor: int,
    ):
        super().__init__()
        self.factor = factor
        # The conv over-produces channels; the shuffle in forward() trades
        # them for a factor-x larger spatial resolution.
        self.conv = ConvLayer(
            in_channels=in_channels,
            out_channels=out_channels * factor**2,
            kernel_size=kernel_size,
            use_bias=True,
            norm=None,
            act_func=None,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.pixel_shuffle(self.conv(x), self.factor)
class InterpolateConvUpSampleLayer(nn.Module):
    """Upsample by interpolation, then refine with a conv.

    5-D inputs go through chunked_interpolate, and the time axis is only
    scaled when temporal_upsample is set and T > 1.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        factor: int,
        mode: str = "nearest",
        is_video: bool = False,
        temporal_upsample: bool = False,
    ) -> None:
        super().__init__()
        self.factor = factor
        self.mode = mode
        self.temporal_upsample = temporal_upsample
        self.conv = ConvLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            use_bias=True,
            norm=None,
            act_func=None,
            is_video=is_video,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() == 4:
            x = F.interpolate(x, scale_factor=self.factor, mode=self.mode)
        elif x.dim() == 5:
            # [B, C, T, H, W]: scale T only for genuine video input (T > 1).
            t_factor = self.factor if (self.temporal_upsample and x.size(2) != 1) else 1
            x = chunked_interpolate(x, scale_factor=[t_factor, self.factor, self.factor], mode=self.mode)
        return self.conv(x)

    def __repr__(self):
        return f"InterpolateConvUpSampleLayer(factor={self.factor}, mode={self.mode}, temporal_upsample={self.temporal_upsample})"
class ChannelDuplicatingPixelShuffleUpSampleLayer(nn.Module):
    """Parameter-free upsample: duplicate channels, then pixel-shuffle
    (channel-to-space) by `factor`.

    5-D (NCTHW) inputs use 3-D shuffle (including time) when temporal_upsample
    is set and T > 1, otherwise the 2-D shuffle is applied per frame.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        factor: int,
        temporal_upsample: bool = False,  # upsample on the temporal dimension as well
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor = factor
        assert out_channels * factor**2 % in_channels == 0
        self.temporal_upsample = temporal_upsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() == 4:  # original image-only training
            # BUGFIX: channel duplication was previously applied only inside
            # the 5-D branch, so 4-D inputs were shuffled without duplication
            # and came out with in_channels // factor**2 channels instead of
            # out_channels.
            repeats = self.out_channels * self.factor**2 // self.in_channels
            x = x.repeat_interleave(repeats, dim=1)
            x = F.pixel_shuffle(x, self.factor)
        elif x.dim() == 5:  # [B, C, T, H, W]
            B, C, T, H, W = x.shape
            assert C == self.in_channels
            if self.temporal_upsample and T != 1:  # video input
                repeats = self.out_channels * self.factor**3 // self.in_channels
                x = x.repeat_interleave(repeats, dim=1)
                x = pixel_shuffle_3d(x, self.factor)
            else:
                repeats = self.out_channels * self.factor**2 // self.in_channels
                x = x.repeat_interleave(repeats, dim=1)
                x = x.permute(0, 2, 1, 3, 4)  # [B, T, C, H, W]
                x = F.pixel_shuffle(x, self.factor)  # on H and W only
                x = x.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]
        return x

    def __repr__(self):
        return f"ChannelDuplicatingPixelShuffleUpSampleLayer(in_channels={self.in_channels}, out_channels={self.out_channels}, factor={self.factor}, temporal_upsample={self.temporal_upsample})"
class LinearLayer(nn.Module):
    """Linear + optional norm + optional activation.

    Inputs with more than two dims are flattened to (batch, -1) first.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        use_bias=True,
        dropout=0,
        norm=None,
        act_func=None,
    ):
        super().__init__()
        self.dropout = nn.Dropout(dropout, inplace=False) if dropout > 0 else None
        self.linear = nn.Linear(in_features, out_features, use_bias)
        self.norm = build_norm(norm, num_features=out_features)
        self.act = build_act(act_func)

    def _try_squeeze(self, x: torch.Tensor) -> torch.Tensor:
        # Collapse any trailing dims so the tensor is (batch, features).
        return torch.flatten(x, start_dim=1) if x.dim() > 2 else x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self._try_squeeze(x)
        if self.dropout:
            x = self.dropout(x)
        x = self.linear(x)
        for extra in (self.norm, self.act):
            if extra:
                x = extra(x)
        return x
class IdentityLayer(nn.Module):
    """No-op module: returns its input unchanged (placeholder in block lists)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x
#################################################################################
# Basic Blocks #
#################################################################################
class DSConv(nn.Module):
    """Depthwise-separable conv: depthwise k x k followed by pointwise 1x1.

    use_bias/norm/act_func each take a (depthwise, pointwise) pair; scalars
    are broadcast to both via val2tuple.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        use_bias=False,
        norm=("bn2d", "bn2d"),
        act_func=("relu6", None),
    ):
        super().__init__()
        use_bias = val2tuple(use_bias, 2)
        norm = val2tuple(norm, 2)
        act_func = val2tuple(act_func, 2)
        # Depthwise: one filter per input channel (groups == in_channels).
        self.depth_conv = ConvLayer(
            in_channels,
            in_channels,
            kernel_size,
            stride,
            groups=in_channels,
            norm=norm[0],
            act_func=act_func[0],
            use_bias=use_bias[0],
        )
        # Pointwise: 1x1 conv mixing channels up to out_channels.
        self.point_conv = ConvLayer(
            in_channels,
            out_channels,
            1,
            norm=norm[1],
            act_func=act_func[1],
            use_bias=use_bias[1],
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.point_conv(self.depth_conv(x))
class MBConv(nn.Module):
    """Inverted bottleneck: 1x1 expand -> k x k depthwise -> 1x1 project.

    use_bias/norm/act_func each take a triple (expand, depthwise, project);
    scalars are broadcast via val2tuple.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        mid_channels=None,
        expand_ratio=6,
        use_bias=False,
        norm=("bn2d", "bn2d", "bn2d"),
        act_func=("relu6", "relu6", None),
    ):
        super().__init__()
        use_bias = val2tuple(use_bias, 3)
        norm = val2tuple(norm, 3)
        act_func = val2tuple(act_func, 3)
        if mid_channels is None:
            mid_channels = round(in_channels * expand_ratio)
        self.inverted_conv = ConvLayer(
            in_channels,
            mid_channels,
            1,
            stride=1,
            norm=norm[0],
            act_func=act_func[0],
            use_bias=use_bias[0],
        )
        self.depth_conv = ConvLayer(
            mid_channels,
            mid_channels,
            kernel_size,
            stride=stride,
            groups=mid_channels,
            norm=norm[1],
            act_func=act_func[1],
            use_bias=use_bias[1],
        )
        self.point_conv = ConvLayer(
            mid_channels,
            out_channels,
            1,
            norm=norm[2],
            act_func=act_func[2],
            use_bias=use_bias[2],
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.point_conv(self.depth_conv(self.inverted_conv(x)))
class FusedMBConv(nn.Module):
    """Fused MBConv: a full k x k expand conv followed by a 1x1 projection
    (replaces MBConv's expand + depthwise pair)."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        mid_channels=None,
        expand_ratio=6,
        groups=1,
        use_bias=False,
        norm=("bn2d", "bn2d"),
        act_func=("relu6", None),
    ):
        super().__init__()
        use_bias = val2tuple(use_bias, 2)
        norm = val2tuple(norm, 2)
        act_func = val2tuple(act_func, 2)
        if mid_channels is None:
            mid_channels = round(in_channels * expand_ratio)
        self.spatial_conv = ConvLayer(
            in_channels,
            mid_channels,
            kernel_size,
            stride,
            groups=groups,
            use_bias=use_bias[0],
            norm=norm[0],
            act_func=act_func[0],
        )
        self.point_conv = ConvLayer(
            mid_channels,
            out_channels,
            1,
            use_bias=use_bias[1],
            norm=norm[1],
            act_func=act_func[1],
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.point_conv(self.spatial_conv(x))
class GLUMBConv(nn.Module):
    """MBConv variant with a GLU gate after the depthwise stage.

    The 1x1 expansion produces ``2 * mid_channels`` channels; after the
    depthwise conv the tensor is chunked in half along channels, one half
    gates the other (gate activated by ``act_func[1]``), and the gated result
    is projected by a final 1x1 conv.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        mid_channels=None,
        expand_ratio=6,
        use_bias=False,
        norm=(None, None, "ln2d"),
        act_func=("silu", "silu", None),
        is_video=False,
    ):
        super().__init__()
        use_bias = val2tuple(use_bias, 3)
        norm = val2tuple(norm, 3)
        act_func = val2tuple(act_func, 3)
        if mid_channels is None:
            mid_channels = round(in_channels * expand_ratio)
        # Activation applied only to the gate half of the GLU split.
        self.glu_act = build_act(act_func[1], inplace=False)
        self.inverted_conv = ConvLayer(
            in_channels,
            mid_channels * 2,
            1,
            use_bias=use_bias[0],
            norm=norm[0],
            act_func=act_func[0],
            is_video=is_video,
        )
        # Depthwise conv over both halves; its activation is deferred to the GLU gate.
        self.depth_conv = ConvLayer(
            mid_channels * 2,
            mid_channels * 2,
            kernel_size,
            stride=stride,
            groups=mid_channels * 2,
            use_bias=use_bias[1],
            norm=norm[1],
            act_func=None,
            is_video=is_video,
        )
        self.point_conv = ConvLayer(
            mid_channels,
            out_channels,
            1,
            use_bias=use_bias[2],
            norm=norm[2],
            act_func=act_func[2],
            is_video=is_video,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.depth_conv(self.inverted_conv(x))
        value, gate = torch.chunk(hidden, 2, dim=1)
        return self.point_conv(value * self.glu_act(gate))
class ResBlock(nn.Module):
    """Two stacked KxK ConvLayers: expand to mid_channels, then project to out_channels.

    Note: despite the name, there is no residual connection inside this block;
    callers wrap it (see ResidualBlock below).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        stride=1,
        mid_channels=None,
        expand_ratio=1,
        use_bias=False,
        norm=("bn2d", "bn2d"),
        act_func=("relu6", None),
        is_video=False,
    ):
        super().__init__()
        use_bias = val2tuple(use_bias, 2)
        norm = val2tuple(norm, 2)
        act_func = val2tuple(act_func, 2)
        if mid_channels is None:
            mid_channels = round(in_channels * expand_ratio)
        # First conv carries the stride; the second is always stride-1.
        self.conv1 = ConvLayer(
            in_channels,
            mid_channels,
            kernel_size,
            stride,
            use_bias=use_bias[0],
            norm=norm[0],
            act_func=act_func[0],
            is_video=is_video,
        )
        self.conv2 = ConvLayer(
            mid_channels,
            out_channels,
            kernel_size,
            1,
            use_bias=use_bias[1],
            norm=norm[1],
            act_func=act_func[1],
            is_video=is_video,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.conv2(self.conv1(x))
class LiteMLA(nn.Module):
    r"""Lightweight multi-scale linear attention.

    Projects the input to joint q/k/v with a 1x1 ConvLayer, adds extra
    multi-scale q/k/v aggregations (one depthwise + grouped-pointwise pair per
    entry of ``scales``), runs ReLU-kernel linear attention over all branches,
    and projects back to ``out_channels``.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        heads: Optional[int] = None,
        heads_ratio: float = 1.0,
        dim=8,
        use_bias=False,
        norm=(None, "bn2d"),
        act_func=(None, None),
        kernel_func="relu",
        scales: tuple[int, ...] = (5,),
        eps=1.0e-15,
        is_video=False,
    ):
        super().__init__()
        # Numerical floor for the attention normalizer division.
        self.eps = eps
        # Default head count: one head per `dim` channels, scaled by heads_ratio.
        heads = int(in_channels // dim * heads_ratio) if heads is None else heads
        total_dim = heads * dim
        # Pairs: index 0 configures the qkv projection, index 1 the output projection.
        use_bias = val2tuple(use_bias, 2)
        norm = val2tuple(norm, 2)
        act_func = val2tuple(act_func, 2)
        self.dim = dim
        # Joint q/k/v projection: 3 * total_dim output channels.
        self.qkv = ConvLayer(
            in_channels,
            3 * total_dim,
            1,
            use_bias=use_bias[0],
            norm=norm[0],
            act_func=act_func[0],
            is_video=is_video,
        )
        conv_class = nn.Conv2d if not is_video else ChannelChunkConv3d
        # One (depthwise scale x scale conv, grouped 1x1 conv) pair per scale;
        # outputs are concatenated with the base qkv in forward().
        self.aggreg = nn.ModuleList(
            [
                nn.Sequential(
                    conv_class(
                        3 * total_dim,
                        3 * total_dim,
                        scale,
                        padding=get_same_padding(scale),
                        groups=3 * total_dim,
                        bias=use_bias[0],
                    ),
                    conv_class(3 * total_dim, 3 * total_dim, 1, groups=3 * heads, bias=use_bias[0]),
                )
                for scale in scales
            ]
        )
        # Non-linearity applied to q and k before the linear-attention matmuls.
        self.kernel_func = build_act(kernel_func, inplace=False)
        # Output projection over all (1 + len(scales)) concatenated branches.
        self.proj = ConvLayer(
            total_dim * (1 + len(scales)),
            out_channels,
            1,
            use_bias=use_bias[1],
            norm=norm[1],
            act_func=act_func[1],
            is_video=is_video,
        )
    @torch.autocast(device_type="cuda", enabled=False)
    def relu_linear_att(self, qkv: torch.Tensor) -> torch.Tensor:
        """O(N) linear attention over flattened spatial(-temporal) tokens.

        Accepts qkv of shape (B, 3*total_dim, H, W) or (B, 3*total_dim, T, H, W);
        runs with autocast disabled. fp16 inputs are upcast to fp32 up front;
        bf16 is only upcast just before the normalizing division below.
        """
        if qkv.ndim == 5:
            B, _, T, H, W = list(qkv.size())
            is_video = True
        else:
            B, _, H, W = list(qkv.size())
            is_video = False
        if qkv.dtype == torch.float16:
            qkv = qkv.float()
        # Flatten tokens: (B, heads*(1+len(scales)), 3*dim, num_tokens).
        if qkv.ndim == 4:
            qkv = torch.reshape(
                qkv,
                (
                    B,
                    -1,
                    3 * self.dim,
                    H * W,
                ),
            )
        elif qkv.ndim == 5:
            qkv = torch.reshape(
                qkv,
                (
                    B,
                    -1,
                    3 * self.dim,
                    H * W * T,
                ),
            )
        # Split the 3*dim axis into q, k, v of dim channels each.
        q, k, v = (
            qkv[:, :, 0 : self.dim],
            qkv[:, :, self.dim : 2 * self.dim],
            qkv[:, :, 2 * self.dim :],
        )
        # lightweight linear attention
        q = self.kernel_func(q)
        k = self.kernel_func(k)
        # linear matmul
        trans_k = k.transpose(-1, -2)
        # Append a constant-1 row to v so the same matmul also accumulates the
        # normalizer sum over k in the last row of `vk`.
        v = F.pad(v, (0, 0, 0, 1), mode="constant", value=1)
        vk = torch.matmul(v, trans_k)
        out = torch.matmul(vk, q)
        if out.dtype == torch.bfloat16:
            out = out.float()
        # Divide value rows by the accumulated normalizer row (+eps for stability).
        out = out[:, :, :-1] / (out[:, :, -1:] + self.eps)
        if not is_video:
            out = torch.reshape(out, (B, -1, H, W))
        else:
            out = torch.reshape(out, (B, -1, T, H, W))
        return out
    @torch.autocast(device_type="cuda", enabled=False)
    def relu_quadratic_att(self, qkv: torch.Tensor) -> torch.Tensor:
        """O(N^2) attention variant (image-only; 4-D qkv).

        Currently unused by forward() — the dispatch to it is commented out there.
        """
        B, _, H, W = list(qkv.size())
        qkv = torch.reshape(
            qkv,
            (
                B,
                -1,
                3 * self.dim,
                H * W,
            ),
        )
        q, k, v = (
            qkv[:, :, 0 : self.dim],
            qkv[:, :, self.dim : 2 * self.dim],
            qkv[:, :, 2 * self.dim :],
        )
        q = self.kernel_func(q)
        k = self.kernel_func(k)
        att_map = torch.matmul(k.transpose(-1, -2), q)  # b h n n
        original_dtype = att_map.dtype
        # Normalize in fp32 for half-precision inputs, then cast back.
        if original_dtype in [torch.float16, torch.bfloat16]:
            att_map = att_map.float()
        att_map = att_map / (torch.sum(att_map, dim=2, keepdim=True) + self.eps)  # b h n n
        att_map = att_map.to(original_dtype)
        out = torch.matmul(v, att_map)  # b h d n
        out = torch.reshape(out, (B, -1, H, W))
        return out
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # generate multi-scale q, k, v
        qkv = self.qkv(x)
        multi_scale_qkv = [qkv]
        for op in self.aggreg:
            multi_scale_qkv.append(op(qkv))
        qkv = torch.cat(multi_scale_qkv, dim=1)
        # H/W (and T) are read here only for the commented-out quadratic-path
        # dispatch below; the linear path recomputes them itself.
        if qkv.ndim == 4:
            H, W = list(qkv.size())[-2:]
            # num_tokens = H * W
        elif qkv.ndim == 5:
            _, _, T, H, W = list(qkv.size())
            # num_tokens = H * W * T
        # if num_tokens > self.dim:
        out = self.relu_linear_att(qkv).to(qkv.dtype)
        # else:
        # if self.is_video:
        # raise NotImplementedError("Video is not supported for quadratic attention")
        # out = self.relu_quadratic_att(qkv)
        out = self.proj(out)
        return out
class EfficientViTBlock(nn.Module):
    """EfficientViT block: a LiteMLA context module followed by a local conv
    module (MBConv or GLUMBConv), each wrapped in an identity-shortcut
    ResidualBlock."""

    def __init__(
        self,
        in_channels: int,
        heads_ratio: float = 1.0,
        dim=32,
        expand_ratio: float = 4,
        scales: tuple[int, ...] = (5,),
        norm: str = "bn2d",
        act_func: str = "hswish",
        context_module: str = "LiteMLA",
        local_module: str = "MBConv",
        is_video: bool = False,
    ):
        super().__init__()
        # Validate the module choices up front (guard clauses).
        if context_module != "LiteMLA":
            raise ValueError(f"context_module {context_module} is not supported")
        if local_module == "MBConv":
            local_cls = MBConv
        elif local_module == "GLUMBConv":
            local_cls = GLUMBConv
        else:
            raise NotImplementedError(f"local_module {local_module} is not supported")
        self.context_module = ResidualBlock(
            LiteMLA(
                in_channels=in_channels,
                out_channels=in_channels,
                heads_ratio=heads_ratio,
                dim=dim,
                norm=(None, norm),
                scales=scales,
                is_video=is_video,
            ),
            IdentityLayer(),
        )
        # MBConv and GLUMBConv share the same constructor signature here.
        self.local_module = ResidualBlock(
            local_cls(
                in_channels=in_channels,
                out_channels=in_channels,
                expand_ratio=expand_ratio,
                use_bias=(True, True, False),
                norm=(None, None, norm),
                act_func=(act_func, act_func, None),
                is_video=is_video,
            ),
            IdentityLayer(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.local_module(self.context_module(x))
#################################################################################
# Functional Blocks #
#################################################################################
class ResidualBlock(nn.Module):
    """Additive residual wrapper: out = main(pre_norm(x)) + shortcut(x).

    Either branch may be None: main=None passes x through untouched, and
    shortcut=None drops the skip connection. ``post_act`` (built via
    build_act) is applied to the combined result when set.
    """

    def __init__(
        self,
        main: Optional[nn.Module],
        shortcut: Optional[nn.Module],
        post_act=None,
        pre_norm: Optional[nn.Module] = None,
    ):
        super().__init__()
        self.pre_norm = pre_norm
        self.main = main
        self.shortcut = shortcut
        self.post_act = build_act(post_act)

    def forward_main(self, x: torch.Tensor) -> torch.Tensor:
        src = x if self.pre_norm is None else self.pre_norm(x)
        return self.main(src)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.main is None:
            res = x
        else:
            res = self.forward_main(x)
            if self.shortcut is not None:
                res = res + self.shortcut(x)
        if self.post_act:
            res = self.post_act(res)
        return res
class DAGBlock(nn.Module):
    """DAG-shaped block: per-key input ops -> merge ("add"/"cat") -> middle -> per-key output ops.

    forward() consumes a feature dict and writes each output op's result back
    into the same dict under its key.
    """

    def __init__(
        self,
        inputs: dict[str, nn.Module],
        merge: str,
        post_input: Optional[nn.Module],
        middle: nn.Module,
        outputs: dict[str, nn.Module],
    ):
        super().__init__()
        # Keys and ops are stored as parallel sequences so ordering is stable.
        self.input_keys = list(inputs.keys())
        self.input_ops = nn.ModuleList(list(inputs.values()))
        self.merge = merge
        self.post_input = post_input
        self.middle = middle
        self.output_keys = list(outputs.keys())
        self.output_ops = nn.ModuleList(list(outputs.values()))

    def forward(self, feature_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        branches = [op(feature_dict[key]) for key, op in zip(self.input_keys, self.input_ops)]
        if self.merge == "add":
            merged = list_sum(branches)
        elif self.merge == "cat":
            merged = torch.concat(branches, dim=1)
        else:
            raise NotImplementedError
        if self.post_input is not None:
            merged = self.post_input(merged)
        merged = self.middle(merged)
        for key, op in zip(self.output_keys, self.output_ops):
            feature_dict[key] = op(merged)
        return feature_dict
class OpSequential(nn.Module):
    """Sequential container that silently drops ``None`` entries from op_list."""

    def __init__(self, op_list: list[Optional[nn.Module]]):
        super().__init__()
        self.op_list = nn.ModuleList(op for op in op_list if op is not None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.op_list:
            x = layer(x)
        return x

View File

@ -0,0 +1,244 @@
import math
from inspect import signature
from typing import Any, Callable, Optional, Union
import torch
import torch.nn.functional as F
VERBOSE = False
def pixel_shuffle_3d(x, upscale_factor):
    """3D pixel shuffle: fold channel blocks of size r**3 into (T, H, W).

    (B, C, T, H, W) -> (B, C // r**3, T*r, H*r, W*r).
    """
    B, C, T, H, W = x.shape
    r = upscale_factor
    assert C % (r * r * r) == 0, "通道数必须是上采样因子的立方倍数"
    C_out = C // (r * r * r)
    # Expose the three r-sized sub-dimensions hidden in the channel axis.
    expanded = x.view(B, C_out, r, r, r, T, H, W)
    if VERBOSE:
        print("x.view:")
        print(expanded)
        print("x.view.shape:")
        print(expanded.shape)
    # Interleave each r-factor next to its spatial/temporal dimension.
    interleaved = expanded.permute(0, 1, 5, 2, 6, 3, 7, 4)
    if VERBOSE:
        print("x.permute:")
        print(interleaved)
        print("x.permute.shape:")
        print(interleaved.shape)
    return interleaved.reshape(B, C_out, T * r, H * r, W * r)
def pixel_unshuffle_3d(x, downsample_factor):
    """3D pixel unshuffle (inverse of pixel_shuffle_3d).

    (B, C, T, H, W) -> (B, C * r**3, T // r, H // r, W // r); every one of
    T, H, W must be divisible by r.
    """
    B, C, T, H, W = x.shape
    r = downsample_factor
    assert T % r == 0, f"时间维度必须是下采样因子的倍数, got shape {x.shape}"
    assert H % r == 0, f"高度维度必须是下采样因子的倍数, got shape {x.shape}"
    assert W % r == 0, f"宽度维度必须是下采样因子的倍数, got shape {x.shape}"
    t_out, h_out, w_out = T // r, H // r, W // r
    # Split each spatial/temporal axis into (coarse, r) and move the r-factors
    # next to the channel axis before folding them in.
    folded = x.view(B, C, t_out, r, h_out, r, w_out, r).permute(0, 1, 3, 5, 7, 2, 4, 6)
    return folded.reshape(B, C * (r * r * r), t_out, h_out, w_out)
def test_pixel_shuffle_3d():
    """Smoke test: run pixel_shuffle_3d and verify the round trip prints True."""
    # Input (B, C, T, H, W) = (1, 16, 2, 4, 4) filled with 1..512.
    x = torch.arange(1, 1 + 1 * 16 * 2 * 4 * 4).view(1, 16, 2, 4, 4).float()
    print("x:")
    print(x)
    print("x.shape:")
    print(x.shape)
    upscale_factor = 2
    y = pixel_shuffle_3d(x, upscale_factor)
    print("pixelshuffle_3d 结果:")
    print(y)
    print("输出形状:", y.shape)
    # Expected output shape: (1, 2, 4, 8, 8) — channels 16 -> 16/(2*2*2) = 2,
    # time 2 -> 4, height 4 -> 8, width 4 -> 8. The final print should be True:
    # pixel_unshuffle_3d must exactly invert pixel_shuffle_3d.
    print(torch.allclose(x, pixel_unshuffle_3d(y, upscale_factor)))
def chunked_interpolate(x, scale_factor, mode="nearest"):
    """
    Interpolate large tensors by chunking along the channel dimension, keeping
    each F.interpolate call under the int32 element limit.
    https://discuss.pytorch.org/t/error-using-f-interpolate-for-large-3d-input/207859
    Only supports 'nearest' interpolation mode.
    Args:
        x (torch.Tensor): Input tensor (B, C, D, H, W)
        scale_factor: Tuple of scaling factors (d, h, w)
    Returns:
        torch.Tensor: Interpolated tensor
    """
    # Other modes are theoretically supported but untested.
    assert mode == "nearest", "Only the nearest mode is supported"
    if len(x.shape) != 5:
        raise ValueError("Expected 5D input tensor (B, C, D, H, W)")
    # Each interpolate call must stay below max int32 elements.
    int32_max = 2**31 - 1
    # Output elements contributed by a single channel.
    elements_per_channel = (
        math.ceil(x.shape[2] * scale_factor[0])
        * math.ceil(x.shape[3] * scale_factor[1])
        * math.ceil(x.shape[4] * scale_factor[2])
    )
    max_channels = int32_max // (x.shape[0] * elements_per_channel)
    # At least one channel per chunk, at most all channels.
    chunk_size = max(1, min(max_channels, x.shape[1]))
    if VERBOSE:
        print(f"Input channels: {x.shape[1]}")
        print(f"Chunk size: {chunk_size}")
        print(f"max_channels: {max_channels}")
        print(f"num_chunks: {math.ceil(x.shape[1] / chunk_size)}")
    chunks = [
        F.interpolate(x[:, i : i + chunk_size], scale_factor=scale_factor, mode="nearest")
        for i in range(0, x.shape[1], chunk_size)
    ]
    if not chunks:
        raise ValueError(f"No chunks were generated. Input shape: {x.shape}")
    # Reassemble along the channel dimension.
    return torch.cat(chunks, dim=1)
def test_chunked_interpolate():
    """Compare chunked_interpolate against F.interpolate on CUDA tensors.

    Requires a CUDA device; covers upscale, downscale, per-dimension scales,
    large inputs, single-channel input, minimal batch, and odd dimensions.
    """
    cases = [
        ((2, 16, 16, 32, 32), (2.0, 2.0, 2.0)),  # basic upscaling
        ((2, 16, 32, 64, 64), (0.5, 0.5, 0.5)),  # downscaling
        ((2, 16, 16, 32, 32), (2.0, 1.5, 1.5)),  # different scales per dimension
        ((2, 16, 64, 128, 128), (2.0, 2.0, 2.0)),  # large input tensor
        ((2, 16, 8, 32, 32), (2.0, 2.0, 2.0)),  # chunk size equal to input depth
        ((2, 1, 16, 32, 32), (2.0, 2.0, 2.0)),  # single channel input
        ((1, 16, 32, 64, 64), (0.5, 0.5, 0.5)),  # minimal batch size
        ((2, 16, 15, 31, 31), (2.0, 2.0, 2.0)),  # non-power-of-2 dimensions
    ]
    for shape, scale_factor in cases:
        x = torch.randn(*shape).cuda()
        assert torch.allclose(
            chunked_interpolate(x, scale_factor=scale_factor),
            F.interpolate(x, scale_factor=scale_factor, mode="nearest"),
        )
def get_same_padding(kernel_size: Union[int, tuple[int, ...]]) -> Union[int, tuple[int, ...]]:
    """Return 'same' padding for an odd kernel size (element-wise for tuples)."""
    if isinstance(kernel_size, tuple):
        return tuple(get_same_padding(ks) for ks in kernel_size)
    assert kernel_size % 2 > 0, "kernel size should be odd number"
    return kernel_size // 2
def resize(
    x: torch.Tensor,
    size: Optional[Any] = None,
    scale_factor: Optional[list[float]] = None,
    mode: str = "bicubic",
    align_corners: Optional[bool] = False,
) -> torch.Tensor:
    """Thin wrapper over F.interpolate.

    ``align_corners`` is only forwarded for the modes that accept it
    (bilinear/bicubic); other supported modes are nearest and area.
    """
    kwargs = {"size": size, "scale_factor": scale_factor, "mode": mode}
    if mode in {"bilinear", "bicubic"}:
        kwargs["align_corners"] = align_corners
    elif mode not in {"nearest", "area"}:
        raise NotImplementedError(f"resize(mode={mode}) not implemented.")
    return F.interpolate(x, **kwargs)
def build_kwargs_from_config(config: dict, target_func: Callable) -> dict[str, Any]:
    """Filter ``config`` down to the keys ``target_func`` accepts as parameters."""
    valid_keys = set(signature(target_func).parameters)
    return {key: value for key, value in config.items() if key in valid_keys}
if __name__ == "__main__":
    # Manual smoke test; requires a CUDA device (the test allocates .cuda() tensors).
    test_chunked_interpolate()

View File

@ -0,0 +1,3 @@
from .init import *
from .list import *

View File

@ -0,0 +1,63 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from typing import Union
import torch
import torch.nn as nn
from torch.nn.modules.batchnorm import _BatchNorm
__all__ = ["init_modules"]
def init_modules(model: Union[nn.Module, list[nn.Module]], init_type="trunc_normal") -> None:
    """Initialize all weights of ``model`` (or a list of models) in place.

    ``init_type`` is ``"trunc_normal"`` or ``"normal"``, optionally suffixed
    with ``@<std>`` (e.g. ``"trunc_normal@0.02"``). Without a suffix the
    default std of 0.02 is used. Biases are zeroed; norm-layer weights are
    set to 1.
    """
    _DEFAULT_INIT_PARAM = {"trunc_normal": 0.02}
    if isinstance(model, list):
        for sub_module in model:
            init_modules(sub_module, init_type)
        return
    # Parse an optional "@std" suffix; fall back to the shared default.
    parts = init_type.split("@")
    std_override = float(parts[1]) if len(parts) > 1 else None
    std = _DEFAULT_INIT_PARAM["trunc_normal"] if std_override is None else std_override
    if init_type.startswith("trunc_normal"):
        def init_func(param):
            nn.init.trunc_normal_(param, std=std)
    elif init_type.startswith("normal"):
        def init_func(param):
            nn.init.normal_(param, std=std)
    else:
        raise NotImplementedError
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.Linear, nn.ConvTranspose2d)):
            init_func(m.weight)
            if m.bias is not None:
                m.bias.data.zero_()
        elif isinstance(m, nn.Embedding):
            init_func(m.weight)
        elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        else:
            # Fallback for modules with bare weight/bias parameters.
            weight = getattr(m, "weight", None)
            bias = getattr(m, "bias", None)
            if isinstance(weight, torch.nn.Parameter):
                init_func(weight)
            if isinstance(bias, torch.nn.Parameter):
                bias.data.zero_()

View File

@ -0,0 +1,68 @@
# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Optional, Union
__all__ = [
"list_sum",
"list_mean",
"weighted_list_sum",
"list_join",
"val2list",
"val2tuple",
"squeeze_list",
]
def list_sum(x: list) -> Any:
    """Reduce *x* with ``+`` (no start value; empty input raises IndexError)."""
    total = x[0]
    for item in x[1:]:
        total = total + item
    return total
def list_mean(x: list) -> Any:
    """Arithmetic mean of *x*: list_sum(x) divided by the element count."""
    total = list_sum(x)
    return total / len(x)
def weighted_list_sum(x: list, weights: list) -> Any:
    """Sum of ``x[i] * weights[i]``; both lists must have equal length."""
    assert len(x) == len(weights)
    total = x[0] * weights[0]
    for val, w in zip(x[1:], weights[1:]):
        total = total + val * w
    return total
def list_join(x: list, sep="\t", format_str="%s") -> str:
    """Format every element with ``format_str`` and join with ``sep``."""
    return sep.join(format_str % val for val in x)
def val2list(x: Union[list, tuple, Any], repeat_time=1) -> list:
    """Return *x* as a list; a scalar is repeated ``repeat_time`` times."""
    if isinstance(x, (list, tuple)):
        return list(x)
    return [x] * repeat_time
def val2tuple(x: Union[list, tuple, Any], min_len: int = 1, idx_repeat: int = -1) -> tuple:
    """Convert *x* to a tuple, padding to ``min_len`` by repeating the element
    at ``idx_repeat`` (inserted in place, before that position)."""
    x = val2list(x)
    if x:
        # A negative pad count yields an empty list, leaving x unchanged.
        x[idx_repeat:idx_repeat] = [x[idx_repeat]] * (min_len - len(x))
    return tuple(x)
def squeeze_list(x: Optional[list]) -> Union[list, Any]:
    """Unwrap single-element lists; everything else (including None) passes through."""
    return x[0] if x is not None and len(x) == 1 else x

View File

@ -0,0 +1,5 @@
from pathlib import Path
import torch
from .autoencoder_kl_causal_3d import CausalVAE3D_HUNYUAN

Some files were not shown because too many files have changed in this diff Show More