Remove prints.

Remove print.
Add LTX-Video support to the Readme.
2024-11-22 10:51:31 -05:00 · 2024-11-22 10:49:15 -05:00 · 2024-11-22 09:24:20 -05:00 · 2024-11-22 08:46:39 -05:00 · 2024-11-22 02:10:09 -05:00 · 2024-11-21 08:38:23 -05:00
336 changed files with 208279 additions and 44346 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@@ -62,12 +62,38 @@ except:

 print("checking out master branch")
 branch = repo.lookup_branch('master')
-ref = repo.lookup_reference(branch.name)
-repo.checkout(ref)
+if branch is None:
+    ref = repo.lookup_reference('refs/remotes/origin/master')
+    repo.checkout(ref)
+    branch = repo.lookup_branch('master')
+    if branch is None:
+        repo.create_branch('master', repo.get(ref.target))
+else:
+    ref = repo.lookup_reference(branch.name)
+    repo.checkout(ref)

 print("pulling latest changes")
 pull(repo)

+if "--stable" in sys.argv:
+    def latest_tag(repo):
+        versions = []
+        for k in repo.references:
+            try:
+                prefix = "refs/tags/v"
+                if k.startswith(prefix):
+                    version = list(map(int, k[len(prefix):].split(".")))
+                    versions.append((version[0] * 10000000000 + version[1] * 100000 + version[2], k))
+            except:
+                pass
+        versions.sort()
+        if len(versions) > 0:
+            return versions[-1][1]
+        return None
+    latest_tag = latest_tag(repo)
+    if latest_tag is not None:
+        repo.checkout(latest_tag)
+
 print("Done!")

 self_update = True
@@ -108,3 +134,13 @@ if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path):
        shutil.copy(repo_req_path, req_path)
    except:
        pass
+
+
+stable_update_script = os.path.join(repo_path, ".ci/update_windows/update_comfyui_stable.bat")
+stable_update_script_to = os.path.join(cur_path, "update_comfyui_stable.bat")
+
+try:
+    if not file_size(stable_update_script_to) > 10:
+        shutil.copy(stable_update_script, stable_update_script_to)
+except:
+    pass
--- a/.ci/update_windows/update_comfyui_stable.bat
+++ b/.ci/update_windows/update_comfyui_stable.bat
@@ -0,0 +1,8 @@
+@echo off
+..\python_embeded\python.exe .\update.py ..\ComfyUI\ --stable
+if exist update_new.py (
+  move /y update_new.py update.py
+  echo Running updater again since it got updated.
+  ..\python_embeded\python.exe .\update.py ..\ComfyUI\ --skip_self_update --stable
+)
+if "%~1"=="" pause
--- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt
@@ -14,7 +14,7 @@ run_cpu.bat

 IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints

-You can download the stable diffusion 1.5 one from: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt
+You can download the stable diffusion 1.5 one from: https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/blob/main/v1-5-pruned-emaonly-fp16.safetensors


 RECOMMENDED WAY TO UPDATE:
--- a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat
+++ b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat
@@ -0,0 +1,2 @@
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast
+pause
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+/web/assets/** linguist-generated
+/web/** linguist-vendored
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,5 +1,8 @@
 blank_issues_enabled: true
 contact_links:
+  - name: ComfyUI Frontend Issues
+    url: https://github.com/Comfy-Org/ComfyUI_frontend/issues
+    about: Issues related to the ComfyUI frontend (display issues, user interaction bugs), please go to the frontend repo to file the issue
  - name: ComfyUI Matrix Space
    url: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
    about: The ComfyUI Matrix Space is available for support and general discussion related to ComfyUI (Matrix is like Discord but open source).
--- a/.github/workflows/pullrequest-ci-run.yml
+++ b/.github/workflows/pullrequest-ci-run.yml
@@ -0,0 +1,53 @@
+# This is the GitHub Workflow that drives full-GPU-enabled tests of pull requests to ComfyUI, when the 'Run-CI-Test' label is added
+# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
+name: Pull Request CI Workflow Runs
+on:
+    pull_request_target:
+        types: [labeled]
+
+jobs:
+  pr-test-stable:
+    if: ${{ github.event.label.name == 'Run-CI-Test' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos, linux, windows]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
+        cuda_version: ["12.1"]
+        torch_version: ["stable"]
+        include:
+          - os: macos
+            runner_label: [self-hosted, macOS]
+            flags: "--use-pytorch-cross-attention"
+          - os: linux
+            runner_label: [self-hosted, Linux]
+            flags: ""
+          - os: windows
+            runner_label: [self-hosted, Windows]
+            flags: ""
+    runs-on: ${{ matrix.runner_label }}
+    steps:
+      - name: Test Workflows
+        uses: comfy-org/comfy-action@main
+        with:
+          os: ${{ matrix.os }}
+          python_version: ${{ matrix.python_version }}
+          torch_version: ${{ matrix.torch_version }}
+          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+          comfyui_flags: ${{ matrix.flags }}
+          use_prior_commit: 'true'
+  comment:
+    if: ${{ github.event.label.name == 'Run-CI-Test' }}
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - uses: actions/github-script@v6
+        with:
+          script: |
+            github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: '(Automated Bot Message) CI Tests are running, you can view the results at https://ci.comfy.org/?branch=${{ github.event.pull_request.number }}%2Fmerge'
+            })
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,23 @@
+name: Python Linting
+
+on: [push, pull_request]
+
+jobs:
+  pylint:
+    name: Run Pylint
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.x
+
+    - name: Install Pylint
+      run: pip install pylint
+
+    - name: Run Pylint
+      run: pylint --rcfile=.pylintrc $(find . -type f -name "*.py")
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -2,9 +2,28 @@
 name: "Release Stable Version"

 on:
-  push:
-    tags:
-      - 'v*'
+  workflow_dispatch:
+    inputs:
+      git_tag:
+        description: 'Git tag'
+        required: true
+        type: string
+      cu:
+        description: 'CUDA version'
+        required: true
+        type: string
+        default: "124"
+      python_minor:
+        description: 'Python minor version'
+        required: true
+        type: string
+        default: "12"
+      python_patch:
+        description: 'Python patch version'
+        required: true
+        type: string
+        default: "7"
+

 jobs:
  package_comfy_windows:
@@ -13,69 +32,44 @@ jobs:
      packages: "write"
      pull-requests: "read"
    runs-on: windows-latest
-    strategy:
-      matrix:
-        python_version: [3.11.8]
-        cuda_version: [121]
    steps:
-      - name: Calculate Minor Version
-        shell: bash
-        run: |
-          # Extract the minor version from the Python version
-          MINOR_VERSION=$(echo "${{ matrix.python_version }}" | cut -d'.' -f2)
-          echo "MINOR_VERSION=$MINOR_VERSION" >> $GITHUB_ENV
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-        
      - uses: actions/checkout@v4
        with:
+          ref: ${{ inputs.git_tag }}
          fetch-depth: 0
          persist-credentials: false
+      - uses: actions/cache/restore@v4
+        id: cache
+        with:
+          path: |
+            cu${{ inputs.cu }}_python_deps.tar
+            update_comfyui_and_python_dependencies.bat
+          key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
      - shell: bash
        run: |
-          echo "@echo off
-          call update_comfyui.bat nopause
-          echo -
-          echo This will try to update pytorch and all python dependencies.
-          echo -
-          echo If you just want to update normally, close this and run update_comfyui.bat instead.
-          echo -
-          pause
-          ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r ../ComfyUI/requirements.txt pygit2
-          pause" > update_comfyui_and_python_dependencies.bat
-
-          python -m pip wheel --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r requirements.txt pygit2 -w ./temp_wheel_dir
-          python -m pip install --no-cache-dir ./temp_wheel_dir/*
-          echo installed basic
-          ls -lah temp_wheel_dir
-          mv temp_wheel_dir cu${{ matrix.cuda_version }}_python_deps
-          mv cu${{ matrix.cuda_version }}_python_deps ../
+          mv cu${{ inputs.cu }}_python_deps.tar ../
          mv update_comfyui_and_python_dependencies.bat ../
          cd ..
+          tar xf cu${{ inputs.cu }}_python_deps.tar
          pwd
          ls
-          
+
+      - shell: bash
+        run: |
+          cd ..
          cp -r ComfyUI ComfyUI_copy
-          curl https://www.python.org/ftp/python/${{ matrix.python_version }}/python-${{ matrix.python_version }}-embed-amd64.zip -o python_embeded.zip
+          curl https://www.python.org/ftp/python/3.${{ inputs.python_minor }}.${{ inputs.python_patch }}/python-3.${{ inputs.python_minor }}.${{ inputs.python_patch }}-embed-amd64.zip -o python_embeded.zip
          unzip python_embeded.zip -d python_embeded
          cd python_embeded
          echo ${{ env.MINOR_VERSION }}
-          echo 'import site' >> ./python3${{ env.MINOR_VERSION }}._pth
+          echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
          curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
          ./python.exe get-pip.py
-          ./python.exe --version
-          echo "Pip version:"
-          ./python.exe -m pip --version
+          ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
+            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
+            cd ..

-          set PATH=$PWD/Scripts:$PATH
-          echo $PATH
-          ./python.exe -s -m pip install ../cu${{ matrix.cuda_version }}_python_deps/*
-          sed -i '1i../ComfyUI' ./python3${{ env.MINOR_VERSION }}._pth
-          cd ..
-
-          git clone https://github.com/comfyanonymous/taesd
+          git clone --depth 1 https://github.com/comfyanonymous/taesd
          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

          mkdir ComfyUI_windows_portable
@@ -104,6 +98,7 @@ jobs:
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ComfyUI_windows_portable_nvidia.7z
-          tag: ${{ github.ref }}
+          tag: ${{ inputs.git_tag }}
          overwrite: true
-        
+          prerelease: true
+          make_latest: false
--- a/.github/workflows/stale-issues.yml
+++ b/.github/workflows/stale-issues.yml
@@ -0,0 +1,21 @@
+name: 'Close stale issues'
+on:
+  schedule:
+    # Run daily at 430 am PT
+    - cron: '30 11 * * *'
+permissions:
+  issues: write
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v9
+        with:
+          stale-issue-message: "This issue is being marked stale because it has not had any activity for 30 days. Reply below within 7 days if your issue still isn't solved, and it will be left open. Otherwise, the issue will be closed automatically."
+          days-before-stale: 30
+          days-before-close: 7
+          stale-issue-label: 'Stale'
+          only-labels: 'User Support'
+          exempt-all-assignees: true
+          exempt-all-milestones: true
--- a/.github/workflows/test-browser.yml
+++ b/.github/workflows/test-browser.yml
@@ -1,76 +0,0 @@
-# This is a temporary action during frontend TS migration.
-# This file should be removed after TS migration is completed.
-# The browser test is here to ensure TS repo is working the same way as the
-# current JS code.
-# If you are adding UI feature, please sync your changes to the TS repo:
-# huchenlei/ComfyUI_frontend and update test expectation files accordingly.
-name: Playwright Browser Tests CI
-
-on:
-  push:
-    branches: [ main, master ]
-  pull_request:
-    branches: [ main, master ]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout ComfyUI
-      uses: actions/checkout@v4
-      with:
-        repository: "comfyanonymous/ComfyUI"
-        path: "ComfyUI"
-    - name: Checkout ComfyUI_frontend
-      uses: actions/checkout@v4
-      with:
-        repository: "huchenlei/ComfyUI_frontend"
-        path: "ComfyUI_frontend"
-        ref: "fcc54d803e5b6a9b08a462a1d94899318c96dcbb"
-    - uses: actions/setup-node@v3
-      with:
-        node-version: lts/*
-    - uses: actions/setup-python@v4
-      with:
-        python-version: '3.10'
-    - name: Install requirements
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements.txt
-        pip install wait-for-it
-      working-directory: ComfyUI
-    - name: Start ComfyUI server
-      run: |
-        python main.py --cpu 2>&1 | tee console_output.log &
-        wait-for-it --service 127.0.0.1:8188 -t 600
-      working-directory: ComfyUI
-    - name: Install ComfyUI_frontend dependencies
-      run: |
-        npm ci
-      working-directory: ComfyUI_frontend
-    - name: Install Playwright Browsers
-      run: npx playwright install --with-deps
-      working-directory: ComfyUI_frontend
-    - name: Run Playwright tests
-      run: npx playwright test
-      working-directory: ComfyUI_frontend
-    - name: Check for unhandled exceptions in server log
-      run: |
-        if grep -qE "Exception|Error" console_output.log; then
-          echo "Unhandled exception/error found in server log."
-          exit 1
-        fi
-      working-directory: ComfyUI
-    - uses: actions/upload-artifact@v4
-      if: always()
-      with:
-        name: playwright-report
-        path: ComfyUI_frontend/playwright-report/
-        retention-days: 30
-    - uses: actions/upload-artifact@v4
-      if: always()
-      with:
-        name: console-output
-        path: ComfyUI/console_output.log
-        retention-days: 30
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@@ -0,0 +1,95 @@
+# This is the GitHub Workflow that drives automatic full-GPU-enabled tests of all new commits to the master branch of ComfyUI
+# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
+name: Full Comfy CI Workflow Runs
+on:
+  push:
+    branches:
+      - master
+    paths-ignore:
+      - 'app/**'
+      - 'input/**'
+      - 'output/**'
+      - 'notebooks/**'
+      - 'script_examples/**'
+      - '.github/**'
+      - 'web/**'
+  workflow_dispatch:
+
+jobs:
+  test-stable:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos, linux, windows]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
+        cuda_version: ["12.1"]
+        torch_version: ["stable"]
+        include:
+          - os: macos
+            runner_label: [self-hosted, macOS]
+            flags: "--use-pytorch-cross-attention"
+          - os: linux
+            runner_label: [self-hosted, Linux]
+            flags: ""
+          - os: windows
+            runner_label: [self-hosted, Windows]
+            flags: ""
+    runs-on: ${{ matrix.runner_label }}
+    steps:
+      - name: Test Workflows
+        uses: comfy-org/comfy-action@main
+        with:
+          os: ${{ matrix.os }}
+          python_version: ${{ matrix.python_version }}
+          torch_version: ${{ matrix.torch_version }}
+          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+          comfyui_flags: ${{ matrix.flags }}
+
+  test-win-nightly:
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [windows]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
+        cuda_version: ["12.1"]
+        torch_version: ["nightly"]
+        include:
+          - os: windows
+            runner_label: [self-hosted, Windows]
+            flags: ""
+    runs-on: ${{ matrix.runner_label }}
+    steps:
+      - name: Test Workflows
+        uses: comfy-org/comfy-action@main
+        with:
+          os: ${{ matrix.os }}
+          python_version: ${{ matrix.python_version }}
+          torch_version: ${{ matrix.torch_version }}
+          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+          comfyui_flags: ${{ matrix.flags }}
+
+  test-unix-nightly:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos, linux]
+        python_version: ["3.11"]
+        cuda_version: ["12.1"]
+        torch_version: ["nightly"]
+        include:
+          - os: macos
+            runner_label: [self-hosted, macOS]
+            flags: "--use-pytorch-cross-attention"
+          - os: linux
+            runner_label: [self-hosted, Linux]
+            flags: ""
+    runs-on: ${{ matrix.runner_label }}
+    steps:
+      - name: Test Workflows
+        uses: comfy-org/comfy-action@main
+        with:
+          os: ${{ matrix.os }}
+          python_version: ${{ matrix.python_version }}
+          torch_version: ${{ matrix.torch_version }}
+          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+          comfyui_flags: ${{ matrix.flags }}
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@@ -0,0 +1,45 @@
+name: Test server launches without errors
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout ComfyUI
+      uses: actions/checkout@v4
+      with:
+        repository: "comfyanonymous/ComfyUI"
+        path: "ComfyUI"
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install wait-for-it
+      working-directory: ComfyUI
+    - name: Start ComfyUI server
+      run: |
+        python main.py --cpu 2>&1 | tee console_output.log &
+        wait-for-it --service 127.0.0.1:8188 -t 600
+      working-directory: ComfyUI
+    - name: Check for unhandled exceptions in server log
+      run: |
+        if grep -qE "Exception|Error" console_output.log; then
+          echo "Unhandled exception/error found in server log."
+          exit 1
+        fi
+      working-directory: ComfyUI
+    - uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: console-output
+        path: ComfyUI/console_output.log
+        retention-days: 30
--- a/.github/workflows/test-ui.yaml
+++ b/.github/workflows/test-ui.yaml
@@ -1,26 +0,0 @@
-name: Tests CI
-
-on: [push, pull_request]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-    - uses: actions/setup-node@v3
-      with:
-        node-version: 18
-    - uses: actions/setup-python@v4
-      with: 
-        python-version: '3.10'
-    - name: Install requirements
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements.txt
-    - name: Run Tests
-      run: | 
-        npm ci
-        npm run test:generate
-        npm test -- --verbose
-      working-directory: ./tests-ui
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@@ -0,0 +1,30 @@
+name: Unit Tests
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+    runs-on: ${{ matrix.os }}
+    continue-on-error: true
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python      
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+    - name: Run Unit Tests
+      run: |
+        pip install -r tests-unit/requirements.txt
+        python -m pytest tests-unit
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@@ -8,23 +8,28 @@ on:
        required: false
        type: string
        default: ""
+      extra_dependencies:
+        description: 'extra dependencies'
+        required: false
+        type: string
+        default: ""
      cu:
        description: 'cuda version'
        required: true
        type: string
-        default: "121"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "11"
+        default: "12"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "8"
+        default: "7"
 #  push:
 #    branches:
 #      - master
@@ -51,7 +56,7 @@ jobs:
            ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
            pause" > update_comfyui_and_python_dependencies.bat

-            python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
+            python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
            python -m pip install --no-cache-dir ./temp_wheel_dir/*
            echo installed basic
            ls -lah temp_wheel_dir
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "3"
+        default: "4"
 #  push:
 #    branches:
 #      - master
@@ -49,13 +49,13 @@ jobs:
            echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
            curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
            ./python.exe get-pip.py
-            python -m pip wheel torch torchvision torchaudio mpmath==1.3.0 numpy==1.26.4 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+            python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
            ls ../temp_wheel_dir
            ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
            cd ..

-            git clone https://github.com/comfyanonymous/taesd
+            git clone --depth 1 https://github.com/comfyanonymous/taesd
            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable_nightly_pytorch
@@ -67,6 +67,7 @@ jobs:
            mkdir update
            cp -r ComfyUI/.ci/update_windows/* ./update/
            cp -r ComfyUI/.ci/windows_base_files/* ./
+            cp -r ComfyUI/.ci/windows_nightly_base_files/* ./

            echo "call update_comfyui.bat nopause
            ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@@ -7,19 +7,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "121"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "11"
+        default: "12"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "8"
+        default: "7"
 #  push:
 #    branches:
 #      - master
@@ -66,7 +66,7 @@ jobs:
            sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
            cd ..

-            git clone https://github.com/comfyanonymous/taesd
+            git clone --depth 1 https://github.com/comfyanonymous/taesd
            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable
--- a/.gitignore
+++ b/.gitignore
@@ -12,9 +12,12 @@ extra_model_paths.yaml
 .vscode/
 .idea/
 venv/
+.venv/
 /web/extensions/*
 !/web/extensions/logging.js.example
 !/web/extensions/core/
 /tests-ui/data/object_info.json
 /user/
-*.log
+*.log
+web_custom_versions/
+.DS_Store
--- a/.pylintrc
+++ b/.pylintrc
@@ -0,0 +1,3 @@
+[MESSAGES CONTROL]
+disable=all
+enable=eval-used
--- a/README.md
+++ b/README.md
@@ -1,8 +1,35 @@
-ComfyUI
-=======
-The most powerful and modular stable diffusion GUI and backend.
-----------
-![ComfyUI Screenshot](comfyui_screenshot.png)
+<div align="center">
+
+# ComfyUI
+**The most powerful and modular diffusion model GUI and backend.**
+
+
+[![Website][website-shield]][website-url]
+[![Dynamic JSON Badge][discord-shield]][discord-url]
+[![Matrix][matrix-shield]][matrix-url]
+<br>
+[![][github-release-shield]][github-release-link]
+[![][github-release-date-shield]][github-release-link]
+[![][github-downloads-shield]][github-downloads-link]
+[![][github-downloads-latest-shield]][github-downloads-link]
+
+[matrix-shield]: https://img.shields.io/badge/Matrix-000000?style=flat&logo=matrix&logoColor=white
+[matrix-url]: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
+[website-shield]: https://img.shields.io/badge/ComfyOrg-4285F4?style=flat
+[website-url]: https://www.comfy.org/
+<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
+[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
+[discord-url]: https://www.comfy.org/discord
+
+[github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
+[github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
+[github-release-date-shield]: https://img.shields.io/github/release-date/comfyanonymous/ComfyUI?style=flat
+[github-downloads-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/total?style=flat
+[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
+[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
+
+![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
+</div>

 This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and see what ComfyUI can do you can check out:
 ### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
@@ -12,6 +39,9 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
+- [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
+- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
+- [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -32,6 +62,8 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
 - [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
+- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
 - Starts up very fast.
 - Works fully offline: will never download anything.
@@ -45,6 +77,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
 |------------------------------------|--------------------------------------------------------------------------------------------------------------------|
 | Ctrl + Enter                       | Queue up current graph for generation                                                                              |
 | Ctrl + Shift + Enter               | Queue up current graph as first for generation                                                                     |
+| Ctrl + Alt + Enter                 | Cancel current generation                                                                                          |
 | Ctrl + Z/Ctrl + Y                  | Undo/Redo                                                                                                          |
 | Ctrl + S                           | Save workflow                                                                                                      |
 | Ctrl + O                           | Load workflow                                                                                                      |
@@ -63,10 +96,14 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
 | Alt + `+`                          | Canvas Zoom in                                                                                                     |
 | Alt + `-`                          | Canvas Zoom out                                                                                                    |
 | Ctrl + Shift + LMB + Vertical drag | Canvas Zoom in/out                                                                                                 |
+| P                                  | Pin/Unpin selected nodes                                                                                           |
+| Ctrl + G                           | Group selected nodes                                                                                               |
 | Q                                  | Toggle visibility of the queue                                                                                     |
 | H                                  | Toggle visibility of history                                                                                       |
 | R                                  | Refresh graph                                                                                                      |
 | Double-Click LMB                   | Open node quick search palette                                                                                     |
+| Shift + Drag                       | Move multiple wires at once                                                                                        |
+| Ctrl + Alt + LMB                   | Disconnect all wires from clicked slot                                                                             |

 Ctrl can also be replaced with Cmd instead for macOS users

@@ -76,7 +113,7 @@ Ctrl can also be replaced with Cmd instead for macOS users

 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

-### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
+### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)

 Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints

@@ -92,6 +129,8 @@ To run it on services like paperspace, kaggle or colab you can use my [Jupyter N

 ## Manual Install (Windows, Linux)

+Note that some dependencies do not yet support python 3.13 so using 3.12 is recommended.
+
 Git clone this repo.

 Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
@@ -102,17 +141,17 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

-This is the command to install the nightly with ROCm 6.0 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.1```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2```

 ### NVIDIA

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124```

 This is the command to install pytorch nightly instead which might have performance improvements:

@@ -162,20 +201,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

-### I already have another UI for Stable Diffusion installed do I really have to install all of these dependencies?
-
-You don't. If you have another UI installed and working with its own python venv you can use that venv to run ComfyUI. You can open up your favorite terminal and activate it:
-
-```source path_to_other_sd_gui/venv/bin/activate```
-
-or on Windows:
-
-With Powershell: ```"path_to_other_sd_gui\venv\Scripts\Activate.ps1"```
-
-With cmd.exe: ```"path_to_other_sd_gui\venv\Scripts\activate.bat"```
-
-And then you can use that terminal to run ComfyUI without installing any dependencies. Note that the venv folder might be called something else depending on the SD UI.
-
 # Running

 ```python main.py```
@@ -211,7 +236,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the

 Use ```--preview-method auto``` to enable previews.

-The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
+The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.

 ## How to use TLS/SSL?
 Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
@@ -227,6 +252,47 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w

 See also: [https://www.comfy.org/](https://www.comfy.org/)

+## Frontend Development
+
+As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
+
+### Reporting Issues and Requesting Features
+
+For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
+
+### Using the Latest Frontend
+
+The new frontend is now the default for ComfyUI. However, please note:
+
+1. The frontend in the main ComfyUI repository is updated weekly.
+2. Daily releases are available in the separate frontend repository.
+
+To use the most up-to-date frontend version:
+
+1. For the latest daily release, launch ComfyUI with this command line argument:
+
+   ```
+   --front-end-version Comfy-Org/ComfyUI_frontend@latest
+   ```
+
+2. For a specific version, replace `latest` with the desired version number:
+
+   ```
+   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
+   ```
+
+This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.
+
+### Accessing the Legacy Frontend
+
+If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
+
+```
+--front-end-version Comfy-Org/ComfyUI_legacy_frontend@latest
+```
+
+This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
+
 # QA

 ### Which GPU should I buy for this?
--- a/api_server/init.py
+++ b/api_server/init.py
--- a/api_server/routes/init.py
+++ b/api_server/routes/init.py
--- a/api_server/routes/internal/README.md
+++ b/api_server/routes/internal/README.md
@@ -0,0 +1,3 @@
+# ComfyUI Internal Routes
+
+All routes under the `/internal` path are designated for **internal use by ComfyUI only**. These routes are not intended for use by external applications may change at any time without notice.
--- a/api_server/routes/internal/init.py
+++ b/api_server/routes/internal/init.py
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@@ -0,0 +1,76 @@
+from aiohttp import web
+from typing import Optional
+from folder_paths import models_dir, user_directory, output_directory, folder_names_and_paths
+from api_server.services.file_service import FileService
+from api_server.services.terminal_service import TerminalService
+import app.logger
+
+class InternalRoutes:
+    '''
+    The top level web router for internal routes: /internal/*
+    The endpoints here should NOT be depended upon. It is for ComfyUI frontend use only.
+    Check README.md for more information.
+    
+    '''
+
+    def __init__(self, prompt_server):
+        self.routes: web.RouteTableDef = web.RouteTableDef()
+        self._app: Optional[web.Application] = None
+        self.file_service = FileService({
+            "models": models_dir,
+            "user": user_directory,
+            "output": output_directory
+        })
+        self.prompt_server = prompt_server
+        self.terminal_service = TerminalService(prompt_server)
+
+    def setup_routes(self):
+        @self.routes.get('/files')
+        async def list_files(request):
+            directory_key = request.query.get('directory', '')
+            try:
+                file_list = self.file_service.list_files(directory_key)
+                return web.json_response({"files": file_list})
+            except ValueError as e:
+                return web.json_response({"error": str(e)}, status=400)
+            except Exception as e:
+                return web.json_response({"error": str(e)}, status=500)
+
+        @self.routes.get('/logs')
+        async def get_logs(request):
+            return web.json_response("".join([(l["t"] + " - " + l["m"]) for l in app.logger.get_logs()]))
+
+        @self.routes.get('/logs/raw')
+        async def get_logs(request):
+            self.terminal_service.update_size()
+            return web.json_response({
+                "entries": list(app.logger.get_logs()),
+                "size": {"cols": self.terminal_service.cols, "rows": self.terminal_service.rows}
+            })
+
+        @self.routes.patch('/logs/subscribe')
+        async def subscribe_logs(request):
+            json_data = await request.json()
+            client_id = json_data["clientId"]
+            enabled = json_data["enabled"]
+            if enabled:
+                self.terminal_service.subscribe(client_id)
+            else:
+                self.terminal_service.unsubscribe(client_id)
+
+            return web.Response(status=200)
+
+
+        @self.routes.get('/folder_paths')
+        async def get_folder_paths(request):
+            response = {}
+            for key in folder_names_and_paths:
+                response[key] = folder_names_and_paths[key][0]
+            return web.json_response(response)
+
+    def get_app(self):
+        if self._app is None:
+            self._app = web.Application()
+            self.setup_routes()
+            self._app.add_routes(self.routes)
+        return self._app
--- a/api_server/services/init.py
+++ b/api_server/services/init.py
--- a/api_server/services/file_service.py
+++ b/api_server/services/file_service.py
@@ -0,0 +1,13 @@
+from typing import Dict, List, Optional
+from api_server.utils.file_operations import FileSystemOperations, FileSystemItem
+
+class FileService:
+    def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
+        self.allowed_directories: Dict[str, str] = allowed_directories
+        self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()
+
+    def list_files(self, directory_key: str) -> List[FileSystemItem]:
+        if directory_key not in self.allowed_directories:
+            raise ValueError("Invalid directory key")
+        directory_path: str = self.allowed_directories[directory_key]
+        return self.file_system_ops.walk_directory(directory_path)
--- a/api_server/services/terminal_service.py
+++ b/api_server/services/terminal_service.py
@@ -0,0 +1,60 @@
+from app.logger import on_flush
+import os
+import shutil
+
+
+class TerminalService:
+    def __init__(self, server):
+        self.server = server
+        self.cols = None
+        self.rows = None
+        self.subscriptions = set()
+        on_flush(self.send_messages)
+
+    def get_terminal_size(self):
+        try:
+            size = os.get_terminal_size()
+            return (size.columns, size.lines)
+        except OSError:
+            try:
+                size = shutil.get_terminal_size()
+                return (size.columns, size.lines)
+            except OSError:
+                return (80, 24)  # fallback to 80x24
+
+    def update_size(self):
+        columns, lines = self.get_terminal_size()
+        changed = False
+        
+        if columns != self.cols:
+            self.cols = columns
+            changed = True 
+
+        if lines != self.rows:
+            self.rows = lines
+            changed = True
+
+        if changed:
+            return {"cols": self.cols, "rows": self.rows}
+
+        return None
+
+    def subscribe(self, client_id):
+        self.subscriptions.add(client_id)
+
+    def unsubscribe(self, client_id):
+        self.subscriptions.discard(client_id)
+
+    def send_messages(self, entries):
+        if not len(entries) or not len(self.subscriptions):
+            return
+        
+        new_size = self.update_size()
+        
+        for client_id in self.subscriptions.copy(): # prevent: Set changed size during iteration
+            if client_id not in self.server.sockets:
+                # Automatically unsub if the socket has disconnected
+                self.unsubscribe(client_id)
+                continue
+
+            self.server.send_sync("logs", {"entries": entries, "size": new_size}, client_id)
--- a/api_server/utils/file_operations.py
+++ b/api_server/utils/file_operations.py
@@ -0,0 +1,42 @@
+import os
+from typing import List, Union, TypedDict, Literal
+from typing_extensions import TypeGuard
+class FileInfo(TypedDict):
+    name: str
+    path: str
+    type: Literal["file"]
+    size: int
+
+class DirectoryInfo(TypedDict):
+    name: str
+    path: str
+    type: Literal["directory"]
+
+FileSystemItem = Union[FileInfo, DirectoryInfo]
+
+def is_file_info(item: FileSystemItem) -> TypeGuard[FileInfo]:
+    return item["type"] == "file"
+
+class FileSystemOperations:
+    @staticmethod
+    def walk_directory(directory: str) -> List[FileSystemItem]:
+        file_list: List[FileSystemItem] = []
+        for root, dirs, files in os.walk(directory):
+            for name in files:
+                file_path = os.path.join(root, name)
+                relative_path = os.path.relpath(file_path, directory)
+                file_list.append({
+                    "name": name,
+                    "path": relative_path,
+                    "type": "file",
+                    "size": os.path.getsize(file_path)
+                })
+            for name in dirs:
+                dir_path = os.path.join(root, name)
+                relative_path = os.path.relpath(dir_path, directory)
+                file_list.append({
+                    "name": name,
+                    "path": relative_path,
+                    "type": "directory"
+                })
+        return file_list
--- a/app/init.py
+++ b/app/init.py
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -0,0 +1,204 @@
+from __future__ import annotations
+import argparse
+import logging
+import os
+import re
+import tempfile
+import zipfile
+from dataclasses import dataclass
+from functools import cached_property
+from pathlib import Path
+from typing import TypedDict, Optional
+
+import requests
+from typing_extensions import NotRequired
+from comfy.cli_args import DEFAULT_VERSION_STRING
+
+
+REQUEST_TIMEOUT = 10  # seconds
+
+
+class Asset(TypedDict):
+    url: str
+
+
+class Release(TypedDict):
+    id: int
+    tag_name: str
+    name: str
+    prerelease: bool
+    created_at: str
+    published_at: str
+    body: str
+    assets: NotRequired[list[Asset]]
+
+
+@dataclass
+class FrontEndProvider:
+    owner: str
+    repo: str
+
+    @property
+    def folder_name(self) -> str:
+        return f"{self.owner}_{self.repo}"
+
+    @property
+    def release_url(self) -> str:
+        return f"https://api.github.com/repos/{self.owner}/{self.repo}/releases"
+
+    @cached_property
+    def all_releases(self) -> list[Release]:
+        releases = []
+        api_url = self.release_url
+        while api_url:
+            response = requests.get(api_url, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()  # Raises an HTTPError if the response was an error
+            releases.extend(response.json())
+            # GitHub uses the Link header to provide pagination links. Check if it exists and update api_url accordingly.
+            if "next" in response.links:
+                api_url = response.links["next"]["url"]
+            else:
+                api_url = None
+        return releases
+
+    @cached_property
+    def latest_release(self) -> Release:
+        latest_release_url = f"{self.release_url}/latest"
+        response = requests.get(latest_release_url, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()  # Raises an HTTPError if the response was an error
+        return response.json()
+
+    def get_release(self, version: str) -> Release:
+        if version == "latest":
+            return self.latest_release
+        else:
+            for release in self.all_releases:
+                if release["tag_name"] in [version, f"v{version}"]:
+                    return release
+            raise ValueError(f"Version {version} not found in releases")
+
+
+def download_release_asset_zip(release: Release, destination_path: str) -> None:
+    """Download dist.zip from github release."""
+    asset_url = None
+    for asset in release.get("assets", []):
+        if asset["name"] == "dist.zip":
+            asset_url = asset["url"]
+            break
+
+    if not asset_url:
+        raise ValueError("dist.zip not found in the release assets")
+
+    # Use a temporary file to download the zip content
+    with tempfile.TemporaryFile() as tmp_file:
+        headers = {"Accept": "application/octet-stream"}
+        response = requests.get(
+            asset_url, headers=headers, allow_redirects=True, timeout=REQUEST_TIMEOUT
+        )
+        response.raise_for_status()  # Ensure we got a successful response
+
+        # Write the content to the temporary file
+        tmp_file.write(response.content)
+
+        # Go back to the beginning of the temporary file
+        tmp_file.seek(0)
+
+        # Extract the zip file content to the destination path
+        with zipfile.ZipFile(tmp_file, "r") as zip_ref:
+            zip_ref.extractall(destination_path)
+
+
+class FrontendManager:
+    DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
+    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
+
+    @classmethod
+    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
+        """
+        Args:
+            value (str): The version string to parse.
+
+        Returns:
+            tuple[str, str]: A tuple containing provider name and version.
+
+        Raises:
+            argparse.ArgumentTypeError: If the version string is invalid.
+        """
+        VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+|latest)$"
+        match_result = re.match(VERSION_PATTERN, value)
+        if match_result is None:
+            raise argparse.ArgumentTypeError(f"Invalid version string: {value}")
+
+        return match_result.group(1), match_result.group(2), match_result.group(3)
+
+    @classmethod
+    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
+        """
+        Initializes the frontend for the specified version.
+
+        Args:
+            version_string (str): The version string.
+            provider (FrontEndProvider, optional): The provider to use. Defaults to None.
+
+        Returns:
+            str: The path to the initialized frontend.
+
+        Raises:
+            Exception: If there is an error during the initialization process.
+            main error source might be request timeout or invalid URL.
+        """
+        if version_string == DEFAULT_VERSION_STRING:
+            return cls.DEFAULT_FRONTEND_PATH
+
+        repo_owner, repo_name, version = cls.parse_version_string(version_string)
+
+        if version.startswith("v"):
+            expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
+            if os.path.exists(expected_path):
+                logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
+                return expected_path
+
+        logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")
+
+        provider = provider or FrontEndProvider(repo_owner, repo_name)
+        release = provider.get_release(version)
+
+        semantic_version = release["tag_name"].lstrip("v")
+        web_root = str(
+            Path(cls.CUSTOM_FRONTENDS_ROOT) / provider.folder_name / semantic_version
+        )
+        if not os.path.exists(web_root):
+            try:
+                os.makedirs(web_root, exist_ok=True)
+                logging.info(
+                    "Downloading frontend(%s) version(%s) to (%s)",
+                    provider.folder_name,
+                    semantic_version,
+                    web_root,
+                )
+                logging.debug(release)
+                download_release_asset_zip(release, destination_path=web_root)
+            finally:
+                # Clean up the directory if it is empty, i.e. the download failed
+                if not os.listdir(web_root):
+                    os.rmdir(web_root)
+
+        return web_root
+
+    @classmethod
+    def init_frontend(cls, version_string: str) -> str:
+        """
+        Initializes the frontend with the specified version string.
+
+        Args:
+            version_string (str): The version string to initialize the frontend with.
+
+        Returns:
+            str: The path of the initialized frontend.
+        """
+        try:
+            return cls.init_frontend_unsafe(version_string)
+        except Exception as e:
+            logging.error("Failed to initialize frontend: %s", e)
+            logging.info("Falling back to the default frontend.")
+            return cls.DEFAULT_FRONTEND_PATH
--- a/app/logger.py
+++ b/app/logger.py
@@ -0,0 +1,73 @@
+from collections import deque
+from datetime import datetime
+import io
+import logging
+import sys
+import threading
+
+logs = None
+stdout_interceptor = None
+stderr_interceptor = None
+
+
+class LogInterceptor(io.TextIOWrapper):
+    def __init__(self, stream,  *args, **kwargs):
+        buffer = stream.buffer
+        encoding = stream.encoding
+        super().__init__(buffer, *args, **kwargs, encoding=encoding, line_buffering=stream.line_buffering)
+        self._lock = threading.Lock()
+        self._flush_callbacks = []
+        self._logs_since_flush = []
+
+    def write(self, data):
+        entry = {"t": datetime.now().isoformat(), "m": data}
+        with self._lock:
+            self._logs_since_flush.append(entry)
+
+            # Simple handling for cr to overwrite the last output if it isnt a full line
+            # else logs just get full of progress messages
+            if isinstance(data, str) and data.startswith("\r") and not logs[-1]["m"].endswith("\n"):
+                logs.pop()
+            logs.append(entry)
+        super().write(data)
+
+    def flush(self):
+        super().flush()
+        for cb in self._flush_callbacks:
+            cb(self._logs_since_flush)
+            self._logs_since_flush = []
+
+    def on_flush(self, callback):
+        self._flush_callbacks.append(callback)
+
+
+def get_logs():
+    return logs
+
+
+def on_flush(callback):
+    if stdout_interceptor is not None:
+        stdout_interceptor.on_flush(callback)
+    if stderr_interceptor is not None:
+        stderr_interceptor.on_flush(callback)
+
+def setup_logger(log_level: str = 'INFO', capacity: int = 300):
+    global logs
+    if logs:
+        return
+
+    # Override output streams and log to buffer
+    logs = deque(maxlen=capacity)
+
+    global stdout_interceptor
+    global stderr_interceptor
+    stdout_interceptor = sys.stdout = LogInterceptor(sys.stdout)
+    stderr_interceptor = sys.stderr = LogInterceptor(sys.stderr)
+
+    # Setup default global logger
+    logger = logging.getLogger()
+    logger.setLevel(log_level)
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(logging.Formatter("%(message)s"))
+    logger.addHandler(stream_handler)
--- a/app/user_manager.py
+++ b/app/user_manager.py
@@ -1,21 +1,38 @@
+from __future__ import annotations
 import json
 import os
 import re
 import uuid
 import glob
 import shutil
+import logging
 from aiohttp import web
+from urllib import parse
 from comfy.cli_args import args
-from folder_paths import user_directory
+import folder_paths
 from .app_settings import AppSettings
+from typing import TypedDict

 default_user = "default"
-users_file = os.path.join(user_directory, "users.json")
+
+
+class FileInfo(TypedDict):
+    path: str
+    size: int
+    modified: int
+
+
+def get_file_info(path: str, relative_to: str) -> FileInfo:
+    return {
+        "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
+        "size": os.path.getsize(path),
+        "modified": os.path.getmtime(path)
+    }


 class UserManager():
    def __init__(self):
-        global user_directory
+        user_directory = folder_paths.get_user_directory()

        self.settings = AppSettings(self)
        if not os.path.exists(user_directory):
@@ -25,14 +42,17 @@ class UserManager():
                print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")

        if args.multi_user:
-            if os.path.isfile(users_file):
-                with open(users_file) as f:
+            if os.path.isfile(self.get_users_file()):
+                with open(self.get_users_file()) as f:
                    self.users = json.load(f)
            else:
                self.users = {}
        else:
            self.users = {"default": "default"}

+    def get_users_file(self):
+        return os.path.join(folder_paths.get_user_directory(), "users.json")
+
    def get_request_user_id(self, request):
        user = "default"
        if args.multi_user and "comfy-user" in request.headers:
@@ -44,7 +64,7 @@ class UserManager():
        return user

    def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
-        global user_directory
+        user_directory = folder_paths.get_user_directory()

        if type == "userdata":
            root_dir = user_directory
@@ -59,6 +79,10 @@ class UserManager():
            return None

        if file is not None:
+            # Check if filename is url encoded
+            if "%" in file:
+                file = parse.unquote(file)
+
            # prevent leaving /{type}/{user}
            path = os.path.abspath(os.path.join(user_root, file))
            if os.path.commonpath((user_root, path)) != user_root:
@@ -80,8 +104,7 @@ class UserManager():

        self.users[user_id] = name

-        global users_file
-        with open(users_file, "w") as f:
+        with open(self.get_users_file(), "w") as f:
            json.dump(self.users, f)

        return user_id
@@ -112,25 +135,65 @@ class UserManager():

        @routes.get("/userdata")
        async def listuserdata(request):
+            """
+            List user data files in a specified directory.
+
+            This endpoint allows listing files in a user's data directory, with options for recursion,
+            full file information, and path splitting.
+
+            Query Parameters:
+            - dir (required): The directory to list files from.
+            - recurse (optional): If "true", recursively list files in subdirectories.
+            - full_info (optional): If "true", return detailed file information (path, size, modified time).
+            - split (optional): If "true", split file paths into components (only applies when full_info is false).
+
+            Returns:
+            - 400: If 'dir' parameter is missing.
+            - 403: If the requested path is not allowed.
+            - 404: If the requested directory does not exist.
+            - 200: JSON response with the list of files or file information.
+
+            The response format depends on the query parameters:
+            - Default: List of relative file paths.
+            - full_info=true: List of dictionaries with file details.
+            - split=true (and full_info=false): List of lists, each containing path components.
+            """
            directory = request.rel_url.query.get('dir', '')
            if not directory:
-                return web.Response(status=400)
-                
+                return web.Response(status=400, text="Directory not provided")
+
            path = self.get_request_user_filepath(request, directory)
            if not path:
-                return web.Response(status=403)
-            
+                return web.Response(status=403, text="Invalid directory")
+
            if not os.path.exists(path):
-                return web.Response(status=404)
-            
+                return web.Response(status=404, text="Directory not found")
+
            recurse = request.rel_url.query.get('recurse', '').lower() == "true"
-            results = glob.glob(os.path.join(
-                glob.escape(path), '**/*'), recursive=recurse)
-            results = [os.path.relpath(x, path) for x in results if os.path.isfile(x)]
-            
+            full_info = request.rel_url.query.get('full_info', '').lower() == "true"
            split_path = request.rel_url.query.get('split', '').lower() == "true"
-            if split_path:
-                results = [[x] + x.split(os.sep) for x in results]
+
+            # Use different patterns based on whether we're recursing or not
+            if recurse:
+                pattern = os.path.join(glob.escape(path), '**', '*')
+            else:
+                pattern = os.path.join(glob.escape(path), '*')
+
+            def process_full_path(full_path: str) -> FileInfo | str | list[str]:
+                if full_info:
+                    return get_file_info(full_path, path)
+
+                rel_path = os.path.relpath(full_path, path).replace(os.sep, '/')
+                if split_path:
+                    return [rel_path] + rel_path.split('/')
+
+                return rel_path
+
+            results = [
+                process_full_path(full_path)
+                for full_path in glob.glob(pattern, recursive=recurse)
+                if os.path.isfile(full_path)
+            ]

            return web.json_response(results)

@@ -138,14 +201,14 @@ class UserManager():
            file = request.match_info.get(param, None)
            if not file:
                return web.Response(status=400)
-                
+
            path = self.get_request_user_filepath(request, file)
            if not path:
                return web.Response(status=403)
-            
+
            if check_exists and not os.path.exists(path):
                return web.Response(status=404)
-            
+
            return path

        @routes.get("/userdata/{file}")
@@ -153,25 +216,56 @@ class UserManager():
            path = get_user_data_path(request, check_exists=True)
            if not isinstance(path, str):
                return path
-            
+
            return web.FileResponse(path)

        @routes.post("/userdata/{file}")
        async def post_userdata(request):
+            """
+            Upload or update a user data file.
+
+            This endpoint handles file uploads to a user's data directory, with options for
+            controlling overwrite behavior and response format.
+
+            Query Parameters:
+            - overwrite (optional): If "false", prevents overwriting existing files. Defaults to "true".
+            - full_info (optional): If "true", returns detailed file information (path, size, modified time).
+                                  If "false", returns only the relative file path.
+
+            Path Parameters:
+            - file: The target file path (URL encoded if necessary).
+
+            Returns:
+            - 400: If 'file' parameter is missing.
+            - 403: If the requested path is not allowed.
+            - 409: If overwrite=false and the file already exists.
+            - 200: JSON response with either:
+                  - Full file information (if full_info=true)
+                  - Relative file path (if full_info=false)
+
+            The request body should contain the raw file content to be written.
+            """
            path = get_user_data_path(request)
            if not isinstance(path, str):
                return path
-            
-            overwrite = request.query["overwrite"] != "false"
+
+            overwrite = request.query.get("overwrite", 'true') != "false"
+            full_info = request.query.get('full_info', 'false').lower() == "true"
+
            if not overwrite and os.path.exists(path):
-                return web.Response(status=409)
+                return web.Response(status=409, text="File already exists")

            body = await request.read()

            with open(path, "wb") as f:
                f.write(body)
-                
-            resp = os.path.relpath(path, self.get_request_user_filepath(request, None))
+
+            user_path = self.get_request_user_filepath(request, None)
+            if full_info:
+                resp = get_file_info(path, user_path)
+            else:
+                resp = os.path.relpath(path, user_path)
+
            return web.json_response(resp)

        @routes.delete("/userdata/{file}")
@@ -181,25 +275,56 @@ class UserManager():
                return path

            os.remove(path)
-                
+
            return web.Response(status=204)

        @routes.post("/userdata/{file}/move/{dest}")
        async def move_userdata(request):
+            """
+            Move or rename a user data file.
+
+            This endpoint handles moving or renaming files within a user's data directory, with options for
+            controlling overwrite behavior and response format.
+
+            Path Parameters:
+            - file: The source file path (URL encoded if necessary)
+            - dest: The destination file path (URL encoded if necessary)
+
+            Query Parameters:
+            - overwrite (optional): If "false", prevents overwriting existing files. Defaults to "true".
+            - full_info (optional): If "true", returns detailed file information (path, size, modified time).
+                                  If "false", returns only the relative file path.
+
+            Returns:
+            - 400: If either 'file' or 'dest' parameter is missing
+            - 403: If either requested path is not allowed
+            - 404: If the source file does not exist
+            - 409: If overwrite=false and the destination file already exists
+            - 200: JSON response with either:
+                  - Full file information (if full_info=true)
+                  - Relative file path (if full_info=false)
+            """
            source = get_user_data_path(request, check_exists=True)
            if not isinstance(source, str):
                return source
-            
+
            dest = get_user_data_path(request, check_exists=False, param="dest")
            if not isinstance(source, str):
                return dest
-            
-            overwrite = request.query["overwrite"] != "false"
-            if not overwrite and os.path.exists(dest):
-                return web.Response(status=409)

-            print(f"moving '{source}' -> '{dest}'")
+            overwrite = request.query.get("overwrite", 'true') != "false"
+            full_info = request.query.get('full_info', 'false').lower() == "true"
+
+            if not overwrite and os.path.exists(dest):
+                return web.Response(status=409, text="File already exists")
+
+            logging.info(f"moving '{source}' -> '{dest}'")
            shutil.move(source, dest)
-                
-            resp = os.path.relpath(dest, self.get_request_user_filepath(request, None))
+
+            user_path = self.get_request_user_filepath(request, None)
+            if full_info:
+                resp = get_file_info(dest, user_path)
+            else:
+                resp = os.path.relpath(dest, user_path)
+
            return web.json_response(resp)
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@@ -13,6 +13,7 @@ from ..ldm.modules.diffusionmodules.util import (
 from ..ldm.modules.attention import SpatialTransformer
 from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
 from ..ldm.util import exists
+from .control_types import UNION_CONTROLNET_TYPES
 from collections import OrderedDict
 import comfy.ops
 from comfy.ldm.modules.attention import optimized_attention
@@ -92,7 +93,7 @@ class ControlNet(nn.Module):
        transformer_depth_middle=None,
        transformer_depth_output=None,
        attn_precision=None,
-        union_controlnet=False,
+        union_controlnet_num_control_type=None,
        device=None,
        operations=comfy.ops.disable_weight_init,
        **kwargs,
@@ -320,8 +321,8 @@ class ControlNet(nn.Module):
        self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device)
        self._feature_size += ch

-        if union_controlnet:
-            self.num_control_type = 6
+        if union_controlnet_num_control_type is not None:
+            self.num_control_type = union_controlnet_num_control_type
            num_trans_channel = 320
            num_trans_head = 8
            num_trans_layer = 1
@@ -361,7 +362,7 @@ class ControlNet(nn.Module):
            controlnet_cond = self.input_hint_block(hint[idx], emb, context)
            feat_seq = torch.mean(controlnet_cond, dim=(2, 3))
            if idx < len(control_type):
-                feat_seq += self.task_embedding[control_type[idx]]
+                feat_seq += self.task_embedding[control_type[idx]].to(dtype=feat_seq.dtype, device=feat_seq.device)

            inputs.append(feat_seq.unsqueeze(1))
            condition_list.append(controlnet_cond)
@@ -390,6 +391,18 @@ class ControlNet(nn.Module):
        if self.control_add_embedding is not None: #Union Controlnet
            control_type = kwargs.get("control_type", [])

+            if any([c >= self.num_control_type for c in control_type]):
+                max_type = max(control_type)
+                max_type_name = {
+                    v: k for k, v in UNION_CONTROLNET_TYPES.items()
+                }[max_type]
+                raise ValueError(
+                    f"Control type {max_type_name}({max_type}) is out of range for the number of control types" +
+                    f"({self.num_control_type}) supported.\n" +
+                    "Please consider using the ProMax ControlNet Union model.\n" +
+                    "https://huggingface.co/xinsir/controlnet-union-sdxl-1.0/tree/main"
+                )
+
            emb += self.control_add_embedding(control_type, emb.dtype, emb.device)
            if len(control_type) > 0:
                if len(hint.shape) < 5:
--- a/comfy/cldm/control_types.py
+++ b/comfy/cldm/control_types.py
@@ -0,0 +1,10 @@
+UNION_CONTROLNET_TYPES = {
+    "openpose": 0,
+    "depth": 1,
+    "hed/pidi/scribble/ted": 2,
+    "canny/lineart/anime_lineart/mlsd": 3,
+    "normal": 4,
+    "segment": 5,
+    "tile": 6,
+    "repaint": 7,
+}
--- a/comfy/cldm/mmdit.py
+++ b/comfy/cldm/mmdit.py
@@ -6,6 +6,7 @@ class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
    def __init__(
        self,
        num_blocks = None,
+        control_latent_channels = None,
        dtype = None,
        device = None,
        operations = None,
@@ -17,10 +18,13 @@ class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
        for _ in range(len(self.joint_blocks)):
            self.controlnet_blocks.append(operations.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype))

+        if control_latent_channels is None:
+            control_latent_channels = self.in_channels
+
        self.pos_embed_input = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(
            None,
            self.patch_size,
-            self.in_channels,
+            control_latent_channels,
            self.hidden_size,
            bias=True,
            strict_img_size=False,
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -1,7 +1,10 @@
 import argparse
 import enum
+import os
+from typing import Optional
 import comfy.options

+
 class EnumAction(argparse.Action):
    """
    Argparse action for handling Enums
@@ -33,7 +36,7 @@ class EnumAction(argparse.Action):

 parser = argparse.ArgumentParser()

-parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)")
+parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0,::", help="Specify the IP address to listen on (default: 127.0.0.1). You can give a list of ip addresses by separating them with a comma like: 127.2.2.2,127.3.3.3 If --listen is provided without an argument, it defaults to 0.0.0.0,:: (listens on all ipv4 and ipv6)")
 parser.add_argument("--port", type=int, default=8188, help="Set the listen port.")
 parser.add_argument("--tls-keyfile", type=str, help="Path to TLS (SSL) key file. Enables TLS, makes app accessible at https://... requires --tls-certfile to function")
 parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certificate file. Enables TLS, makes app accessible at https://... requires --tls-keyfile to function")
@@ -89,6 +92,12 @@ class LatentPreviewMethod(enum.Enum):

 parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)

+parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
+
+cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
+cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
+
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
@@ -109,9 +118,14 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

+parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.")
+
+
+parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
+parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@@ -122,8 +136,42 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis

 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

-parser.add_argument("--verbose", action="store_true", help="Enables more debug prints.")
+parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')

+# The default built-in provider hosted under web/
+DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
+
+parser.add_argument(
+    "--front-end-version",
+    type=str,
+    default=DEFAULT_VERSION_STRING,
+    help="""
+    Specifies the version of the frontend to be used. This command needs internet connectivity to query and
+    download available frontend implementations from GitHub releases.
+
+    The version string should be in the format of:
+    [repoOwner]/[repoName]@[version]
+    where version is one of: "latest" or a valid version number (e.g. "1.0.0")
+    """,
+)
+
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
+
+    if not os.path.isdir(path):
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
+    return path
+
+parser.add_argument(
+    "--front-end-root",
+    type=is_valid_directory,
+    default=None,
+    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
+)
+
+parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
@@ -135,10 +183,3 @@ if args.windows_standalone_build:

 if args.disable_auto_launch:
    args.auto_launch = False
-
-import logging
-logging_level = logging.INFO
-if args.verbose:
-    logging_level = logging.DEBUG
-
-logging.basicConfig(format="%(message)s", level=logging_level)
--- a/comfy/clip_config_bigg.json
+++ b/comfy/clip_config_bigg.json
@@ -5,7 +5,7 @@
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "initializer_factor": 1.0,
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -1,5 +1,6 @@
 import torch
 from comfy.ldm.modules.attention import optimized_attention_for_device
+import comfy.ops

 class CLIPAttention(torch.nn.Module):
    def __init__(self, embed_dim, heads, dtype, device, operations):
@@ -22,6 +23,7 @@ class CLIPAttention(torch.nn.Module):

 ACTIVATIONS = {"quick_gelu": lambda a: a * torch.sigmoid(1.702 * a),
               "gelu": torch.nn.functional.gelu,
+               "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
 }

 class CLIPMLP(torch.nn.Module):
@@ -71,13 +73,13 @@ class CLIPEncoder(torch.nn.Module):
        return x, intermediate

 class CLIPEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
+    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, operations=None):
        super().__init__()
-        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.token_embedding = operations.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens):
-        return self.token_embedding(input_tokens) + self.position_embedding.weight
+    def forward(self, input_tokens, dtype=torch.float32):
+        return self.token_embedding(input_tokens, out_dtype=dtype) + comfy.ops.cast_to(self.position_embedding.weight, dtype=dtype, device=input_tokens.device)


 class CLIPTextModel_(torch.nn.Module):
@@ -87,14 +89,16 @@ class CLIPTextModel_(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        intermediate_size = config_dict["intermediate_size"]
        intermediate_activation = config_dict["hidden_act"]
+        num_positions = config_dict["max_position_embeddings"]
+        self.eos_token_id = config_dict["eos_token_id"]

        super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
+        self.embeddings = CLIPEmbeddings(embed_dim, num_positions=num_positions, dtype=dtype, device=device, operations=operations)
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
-        x = self.embeddings(input_tokens)
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -111,7 +115,7 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

-        pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output

 class CLIPTextModel(torch.nn.Module):
@@ -121,7 +125,6 @@ class CLIPTextModel(torch.nn.Module):
        self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
        embed_dim = config_dict["hidden_size"]
        self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
-        self.text_projection.weight.copy_(torch.eye(embed_dim))
        self.dtype = dtype

    def get_input_embeddings(self):
@@ -137,27 +140,35 @@ class CLIPTextModel(torch.nn.Module):


 class CLIPVisionEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None):
+    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", dtype=None, device=None, operations=None):
        super().__init__()
-        self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))
+
+        num_patches = (image_size // patch_size) ** 2
+        if model_type == "siglip_vision_model":
+            self.class_embedding = None
+            patch_bias = True
+        else:
+            num_patches = num_patches + 1
+            self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))
+            patch_bias = False

        self.patch_embedding = operations.Conv2d(
            in_channels=num_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
-            bias=False,
+            bias=patch_bias,
            dtype=dtype,
            device=device
        )

-        num_patches = (image_size // patch_size) ** 2
-        num_positions = num_patches + 1
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_patches, embed_dim, dtype=dtype, device=device)

    def forward(self, pixel_values):
        embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
-        return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device)
+        if self.class_embedding is not None:
+            embeds = torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1)
+        return embeds + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)


 class CLIPVision(torch.nn.Module):
@@ -168,9 +179,15 @@ class CLIPVision(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        intermediate_size = config_dict["intermediate_size"]
        intermediate_activation = config_dict["hidden_act"]
+        model_type = config_dict["model_type"]

-        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
-        self.pre_layrnorm = operations.LayerNorm(embed_dim)
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
+        if model_type == "siglip_vision_model":
+            self.pre_layrnorm = lambda a: a
+            self.output_layernorm = True
+        else:
+            self.pre_layrnorm = operations.LayerNorm(embed_dim)
+            self.output_layernorm = False
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.post_layernorm = operations.LayerNorm(embed_dim)

@@ -179,14 +196,21 @@ class CLIPVision(torch.nn.Module):
        x = self.pre_layrnorm(x)
        #TODO: attention_mask?
        x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
-        pooled_output = self.post_layernorm(x[:, 0, :])
+        if self.output_layernorm:
+            x = self.post_layernorm(x)
+            pooled_output = x
+        else:
+            pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output

 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.vision_model = CLIPVision(config_dict, dtype, device, operations)
-        self.visual_projection = operations.Linear(config_dict["hidden_size"], config_dict["projection_dim"], bias=False)
+        if "projection_dim" in config_dict:
+            self.visual_projection = operations.Linear(config_dict["hidden_size"], config_dict["projection_dim"], bias=False)
+        else:
+            self.visual_projection = lambda a: a

    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -16,9 +16,9 @@ class Output:
    def __setitem__(self, key, item):
        setattr(self, key, item)

-def clip_preprocess(image, size=224):
-    mean = torch.tensor([ 0.48145466,0.4578275,0.40821073], device=image.device, dtype=image.dtype)
-    std = torch.tensor([0.26862954,0.26130258,0.27577711], device=image.device, dtype=image.dtype)
+def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]):
+    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
+    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
    if not (image.shape[2] == size and image.shape[3] == size):
        scale = (size / min(image.shape[2], image.shape[3]))
@@ -34,6 +34,9 @@ class ClipVisionModel():
        with open(json_config) as f:
            config = json.load(f)

+        self.image_size = config.get("image_size", 224)
+        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
+        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
@@ -50,7 +53,7 @@ class ClipVisionModel():

    def encode_image(self, image):
        comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device)).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std).float()
        out = self.model(pixel_values=pixel_values, intermediate_output=-2)

        outputs = Output()
@@ -93,7 +96,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
-        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
+        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        else:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
    else:
        return None

@@ -105,8 +113,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    keys = list(sd.keys())
    for k in keys:
        if k not in u:
-            t = sd.pop(k)
-            del t
+            sd.pop(k)
    return clip

 def load(ckpt_path):
--- a/comfy/clip_vision_config_vitl_336.json
+++ b/comfy/clip_vision_config_vitl_336.json
@@ -0,0 +1,18 @@
+{
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 336,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-5,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "torch_dtype": "float32"
+}
--- a/comfy/clip_vision_siglip_384.json
+++ b/comfy/clip_vision_siglip_384.json
@@ -0,0 +1,13 @@
+{
+  "num_channels": 3,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 384,
+  "intermediate_size": 4304,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 27,
+  "patch_size": 14,
+  "image_mean": [0.5, 0.5, 0.5],
+  "image_std": [0.5, 0.5, 0.5]
+}
--- a/comfy/comfy_types.py
+++ b/comfy/comfy_types.py
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -1,4 +1,24 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+
 import torch
+from enum import Enum
 import math
 import os
 import logging
@@ -13,6 +33,8 @@ import comfy.cldm.cldm
 import comfy.t2i_adapter.adapter
 import comfy.ldm.cascade.controlnet
 import comfy.cldm.mmdit
+import comfy.ldm.hydit.controlnet
+import comfy.ldm.flux.controlnet


 def broadcast_image_to(tensor, target_batch_size, batched_number):
@@ -33,8 +55,12 @@ def broadcast_image_to(tensor, target_batch_size, batched_number):
    else:
        return torch.cat([tensor] * batched_number, dim=0)

+class StrengthType(Enum):
+    CONSTANT = 1
+    LINEAR_UP = 2
+
 class ControlBase:
-    def __init__(self, device=None):
+    def __init__(self):
        self.cond_hint_original = None
        self.cond_hint = None
        self.strength = 1.0
@@ -45,18 +71,25 @@ class ControlBase:
        self.timestep_range = None
        self.compression_ratio = 8
        self.upscale_algorithm = 'nearest-exact'
-
-        if device is None:
-            device = comfy.model_management.get_torch_device()
-        self.device = device
+        self.extra_args = {}
        self.previous_controlnet = None
+        self.extra_conds = []
+        self.strength_type = StrengthType.CONSTANT
+        self.concat_mask = False
+        self.extra_concat_orig = []
+        self.extra_concat = None

-    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None):
+    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
        self.cond_hint_original = cond_hint
        self.strength = strength
        self.timestep_percent_range = timestep_percent_range
        if self.latent_format is not None:
+            if vae is None:
+                logging.warning("WARNING: no VAE provided to the controlnet apply node when this controlnet requires one.")
            self.vae = vae
+        self.extra_concat_orig = extra_concat.copy()
+        if self.concat_mask and len(self.extra_concat_orig) == 0:
+            self.extra_concat_orig.append(torch.tensor([[[[1.0]]]]))
        return self

    def pre_run(self, model, percent_to_timestep_function):
@@ -71,9 +104,9 @@ class ControlBase:
    def cleanup(self):
        if self.previous_controlnet is not None:
            self.previous_controlnet.cleanup()
-        if self.cond_hint is not None:
-            del self.cond_hint
-            self.cond_hint = None
+
+        self.cond_hint = None
+        self.extra_concat = None
        self.timestep_range = None

    def get_models(self):
@@ -90,7 +123,12 @@ class ControlBase:
        c.compression_ratio = self.compression_ratio
        c.upscale_algorithm = self.upscale_algorithm
        c.latent_format = self.latent_format
+        c.extra_args = self.extra_args.copy()
        c.vae = self.vae
+        c.extra_conds = self.extra_conds.copy()
+        c.strength_type = self.strength_type
+        c.concat_mask = self.concat_mask
+        c.extra_concat_orig = self.extra_concat_orig.copy()

    def inference_memory_requirements(self, dtype):
        if self.previous_controlnet is not None:
@@ -111,9 +149,12 @@ class ControlBase:

                    if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once
                        applied_to.add(x)
-                        x *= self.strength
+                        if self.strength_type == StrengthType.CONSTANT:
+                            x *= self.strength
+                        elif self.strength_type == StrengthType.LINEAR_UP:
+                            x *= (self.strength ** float(len(control_output) - i))

-                    if x.dtype != output_dtype:
+                    if output_dtype is not None and x.dtype != output_dtype:
                        x = x.to(output_dtype)

                out[key].append(x)
@@ -135,9 +176,13 @@ class ControlBase:
                                o[i] = prev_val + o[i] #TODO: change back to inplace add if shared tensors stop being an issue
        return out

+    def set_extra_arg(self, argument, value=None):
+        self.extra_args[argument] = value
+
+
 class ControlNet(ControlBase):
-    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None):
-        super().__init__(device)
+    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False):
+        super().__init__()
        self.control_model = control_model
        self.load_device = load_device
        if control_model is not None:
@@ -148,6 +193,9 @@ class ControlNet(ControlBase):
        self.model_sampling_current = None
        self.manual_cast_dtype = manual_cast_dtype
        self.latent_format = latent_format
+        self.extra_conds += extra_conds
+        self.strength_type = strength_type
+        self.concat_mask = concat_mask

    def get_control(self, x_noisy, t, cond, batched_number):
        control_prev = None
@@ -165,7 +213,6 @@ class ControlNet(ControlBase):
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

-        output_dtype = x_noisy.dtype
        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
            if self.cond_hint is not None:
                del self.cond_hint
@@ -173,6 +220,9 @@ class ControlNet(ControlBase):
            compression_ratio = self.compression_ratio
            if self.vae is not None:
                compression_ratio *= self.vae.downscale_ratio
+            else:
+                if self.latent_format is not None:
+                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
            if self.vae is not None:
                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
@@ -180,19 +230,30 @@ class ControlNet(ControlBase):
                comfy.model_management.load_models_gpu(loaded_models)
            if self.latent_format is not None:
                self.cond_hint = self.latent_format.process_in(self.cond_hint)
-            self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype)
+            if len(self.extra_concat_orig) > 0:
+                to_concat = []
+                for c in self.extra_concat_orig:
+                    c = c.to(self.cond_hint.device)
+                    c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
+                    to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
+                self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
+
+            self.cond_hint = self.cond_hint.to(device=x_noisy.device, dtype=dtype)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)

        context = cond.get('crossattn_controlnet', cond['c_crossattn'])
-        y = cond.get('y', None)
-        if y is not None:
-            y = y.to(dtype)
+        extra = self.extra_args.copy()
+        for c in self.extra_conds:
+            temp = cond.get(c, None)
+            if temp is not None:
+                extra[c] = temp.to(dtype)
+
        timestep = self.model_sampling_current.timestep(t)
        x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)

-        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y)
-        return self.control_merge(control, control_prev, output_dtype)
+        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra)
+        return self.control_merge(control, control_prev, output_dtype=None)

    def copy(self):
        c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
@@ -276,10 +337,11 @@ class ControlLoraOps:


 class ControlLora(ControlNet):
-    def __init__(self, control_weights, global_average_pooling=False, device=None):
-        ControlBase.__init__(self, device)
+    def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options
+        ControlBase.__init__(self)
        self.control_weights = control_weights
        self.global_average_pooling = global_average_pooling
+        self.extra_conds += ["y"]

    def pre_run(self, model, percent_to_timestep_function):
        super().pre_run(model, percent_to_timestep_function)
@@ -332,43 +394,114 @@ class ControlLora(ControlNet):
    def inference_memory_requirements(self, dtype):
        return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)

-def load_controlnet_mmdit(sd):
-    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
-    model_config = comfy.model_detection.model_config_from_unet(new_sd, "", True)
-    num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
-    for k in sd:
-        new_sd[k] = sd[k]
+def controlnet_config(sd, model_options={}):
+    model_config = comfy.model_detection.model_config_from_unet(sd, "", True)

-    supported_inference_dtypes = model_config.supported_inference_dtypes
+    unet_dtype = model_options.get("dtype", None)
+    if unet_dtype is None:
+        weight_dtype = comfy.utils.weight_dtype(sd)
+
+        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)

-    controlnet_config = model_config.unet_config
-    unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
    load_device = comfy.model_management.get_torch_device()
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
-    if manual_cast_dtype is not None:
-        operations = comfy.ops.manual_cast
-    else:
-        operations = comfy.ops.disable_weight_init

-    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **controlnet_config)
-    missing, unexpected = control_model.load_state_dict(new_sd, strict=False)
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
+
+    offload_device = comfy.model_management.unet_offload_device()
+    return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
+
+def controlnet_load_state_dict(control_model, sd):
+    missing, unexpected = control_model.load_state_dict(sd, strict=False)

    if len(missing) > 0:
        logging.warning("missing controlnet keys: {}".format(missing))

    if len(unexpected) > 0:
        logging.debug("unexpected controlnet keys: {}".format(unexpected))
+    return control_model
+
+def load_controlnet_mmdit(sd, model_options={}):
+    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
+    num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
+    for k in sd:
+        new_sd[k] = sd[k]
+
+    concat_mask = False
+    control_latent_channels = new_sd.get("pos_embed_input.proj.weight").shape[1]
+    if control_latent_channels == 17: #inpaint controlnet
+        concat_mask = True
+
+    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, new_sd)

    latent_format = comfy.latent_formats.SD3()
    latent_format.shift_factor = 0 #SD3 controlnet weirdness
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
    return control


-def load_controlnet(ckpt_path, model=None):
-    controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
+def load_controlnet_hunyuandit(controlnet_data, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data, model_options=model_options)
+
+    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
+    control_model = controlnet_load_state_dict(control_model, controlnet_data)
+
+    latent_format = comfy.latent_formats.SDXL()
+    extra_conds = ['text_embedding_mask', 'encoder_hidden_states_t5', 'text_embedding_mask_t5', 'image_meta_size', 'style', 'cos_cis_img', 'sin_cis_img']
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds, strength_type=StrengthType.CONSTANT)
+    return control
+
+def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
+    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, sd)
+    extra_conds = ['y', 'guidance']
+    control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
+def load_controlnet_flux_instantx(sd, model_options={}):
+    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
+    for k in sd:
+        new_sd[k] = sd[k]
+
+    num_union_modes = 0
+    union_cnet = "controlnet_mode_embedder.weight"
+    if union_cnet in new_sd:
+        num_union_modes = new_sd[union_cnet].shape[0]
+
+    control_latent_channels = new_sd.get("pos_embed_input.weight").shape[1] // 4
+    concat_mask = False
+    if control_latent_channels == 17:
+        concat_mask = True
+
+    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(latent_input=True, num_union_modes=num_union_modes, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, new_sd)
+
+    latent_format = comfy.latent_formats.Flux()
+    extra_conds = ['y', 'guidance']
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
+def convert_mistoline(sd):
+    return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
+
+
+def load_controlnet_state_dict(state_dict, model=None, model_options={}):
+    controlnet_data = state_dict
+    if 'after_proj_list.18.bias' in controlnet_data.keys(): #Hunyuan DiT
+        return load_controlnet_hunyuandit(controlnet_data, model_options=model_options)
+
    if "lora_controlnet" in controlnet_data:
-        return ControlLora(controlnet_data)
+        return ControlLora(controlnet_data, model_options=model_options)

    controlnet_config = None
    supported_inference_dtypes = None
@@ -414,7 +547,7 @@ def load_controlnet(ckpt_path, model=None):
                new_sd[diffusers_keys[k]] = controlnet_data.pop(k)

        if "control_add_embedding.linear_1.bias" in controlnet_data: #Union Controlnet
-            controlnet_config["union_controlnet"] = True
+            controlnet_config["union_controlnet_num_control_type"] = controlnet_data["task_embedding"].shape[0]
            for k in list(controlnet_data.keys()):
                new_k = k.replace('.attn.in_proj_', '.attn.in_proj.')
                new_sd[new_k] = controlnet_data.pop(k)
@@ -423,8 +556,15 @@ def load_controlnet(ckpt_path, model=None):
        if len(leftover_keys) > 0:
            logging.warning("leftover keys: {}".format(leftover_keys))
        controlnet_data = new_sd
-    elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format
-        return load_controlnet_mmdit(controlnet_data)
+    elif "controlnet_blocks.0.weight" in controlnet_data:
+        if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
+            return load_controlnet_flux_xlabs_mistoline(controlnet_data, model_options=model_options)
+        elif "pos_embed_input.proj.weight" in controlnet_data:
+            return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
+        elif "controlnet_x_embedder.weight" in controlnet_data:
+            return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
+    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
+        return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)

    pth_key = 'control_model.zero_convs.0.0.weight'
    pth = False
@@ -436,26 +576,38 @@ def load_controlnet(ckpt_path, model=None):
    elif key in controlnet_data:
        prefix = ""
    else:
-        net = load_t2i_adapter(controlnet_data)
+        net = load_t2i_adapter(controlnet_data, model_options=model_options)
        if net is None:
-            logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
+            logging.error("error could not detect control model type.")
        return net

    if controlnet_config is None:
        model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
-        supported_inference_dtypes = model_config.supported_inference_dtypes
+        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
        controlnet_config = model_config.unet_config

+    unet_dtype = model_options.get("dtype", None)
+    if unet_dtype is None:
+        weight_dtype = comfy.utils.weight_dtype(controlnet_data)
+
+        if supported_inference_dtypes is None:
+            supported_inference_dtypes = [comfy.model_management.unet_dtype()]
+
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)
+
    load_device = comfy.model_management.get_torch_device()
-    if supported_inference_dtypes is None:
-        unet_dtype = comfy.model_management.unet_dtype()
-    else:
-        unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)

    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
-    if manual_cast_dtype is not None:
-        controlnet_config["operations"] = comfy.ops.manual_cast
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype)
+
+    controlnet_config["operations"] = operations
    controlnet_config["dtype"] = unet_dtype
+    controlnet_config["device"] = comfy.model_management.unet_offload_device()
    controlnet_config.pop("out_channels")
    controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
    control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
@@ -489,22 +641,32 @@ def load_controlnet(ckpt_path, model=None):
    if len(unexpected) > 0:
        logging.debug("unexpected controlnet keys: {}".format(unexpected))

-    global_average_pooling = False
-    filename = os.path.splitext(ckpt_path)[0]
-    if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
-        global_average_pooling = True
-
+    global_average_pooling = model_options.get("global_average_pooling", False)
    control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
    return control

+def load_controlnet(ckpt_path, model=None, model_options={}):
+    if "global_average_pooling" not in model_options:
+        filename = os.path.splitext(ckpt_path)[0]
+        if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
+            model_options["global_average_pooling"] = True
+
+    cnet = load_controlnet_state_dict(comfy.utils.load_torch_file(ckpt_path, safe_load=True), model=model, model_options=model_options)
+    if cnet is None:
+        logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
+    return cnet
+
 class T2IAdapter(ControlBase):
    def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
-        super().__init__(device)
+        super().__init__()
        self.t2i_model = t2i_model
        self.channels_in = channels_in
        self.control_input = None
        self.compression_ratio = compression_ratio
        self.upscale_algorithm = upscale_algorithm
+        if device is None:
+            device = comfy.model_management.get_torch_device()
+        self.device = device

    def scale_image_to(self, width, height):
        unshuffle_amount = self.t2i_model.unshuffle_amount
@@ -552,7 +714,7 @@ class T2IAdapter(ControlBase):
        self.copy_to(c)
        return c

-def load_t2i_adapter(t2i_data):
+def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
    compression_ratio = 8
    upscale_algorithm = 'nearest-exact'

--- a/comfy/diffusers_load.py
+++ b/comfy/diffusers_load.py
@@ -22,7 +22,7 @@ def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_dire
    if text_encoder2_path is not None:
        text_encoder_paths.append(text_encoder2_path)

-    unet = comfy.sd.load_unet(unet_path)
+    unet = comfy.sd.load_diffusion_model(unet_path)

    clip = None
    if output_clip:
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -16,7 +16,7 @@ class NoiseScheduleVP:
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
        ):
-        """Create a wrapper class for the forward SDE (VP type).
+        r"""Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t.
--- a/comfy/float.py
+++ b/comfy/float.py
@@ -0,0 +1,67 @@
+import torch
+
+def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
+    mantissa_scaled = torch.where(
+        normal_mask,
+        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
+        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
+    )
+
+    mantissa_scaled += torch.rand(mantissa_scaled.size(), dtype=mantissa_scaled.dtype, layout=mantissa_scaled.layout, device=mantissa_scaled.device, generator=generator)
+    return mantissa_scaled.floor() / (2**MANTISSA_BITS)
+
+#Not 100% sure about this
+def manual_stochastic_round_to_float8(x, dtype, generator=None):
+    if dtype == torch.float8_e4m3fn:
+        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
+    elif dtype == torch.float8_e5m2:
+        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
+    else:
+        raise ValueError("Unsupported dtype")
+
+    x = x.half()
+    sign = torch.sign(x)
+    abs_x = x.abs()
+    sign = torch.where(abs_x == 0, 0, sign)
+
+    # Combine exponent calculation and clamping
+    exponent = torch.clamp(
+        torch.floor(torch.log2(abs_x)) + EXPONENT_BIAS,
+        0, 2**EXPONENT_BITS - 1
+    )
+
+    # Combine mantissa calculation and rounding
+    normal_mask = ~(exponent == 0)
+
+    abs_x[:] = calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=generator)
+
+    sign *= torch.where(
+        normal_mask,
+        (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + abs_x),
+        (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
+    )
+
+    inf = torch.finfo(dtype)
+    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
+    return sign
+
+
+
+def stochastic_rounding(value, dtype, seed=0):
+    if dtype == torch.float32:
+        return value.to(dtype=torch.float32)
+    if dtype == torch.float16:
+        return value.to(dtype=torch.float16)
+    if dtype == torch.bfloat16:
+        return value.to(dtype=torch.bfloat16)
+    if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
+        generator = torch.Generator(device=value.device)
+        generator.manual_seed(seed)
+        output = torch.empty_like(value, dtype=dtype)
+        num_slices = max(1, (value.numel() / (4096 * 4096)))
+        slice_size = max(1, round(value.shape[0] / num_slices))
+        for i in range(0, value.shape[0], slice_size):
+            output[i:i+slice_size].copy_(manual_stochastic_round_to_float8(value[i:i+slice_size], dtype, generator=generator))
+        return output
+
+    return value.to(dtype=dtype)
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -9,6 +9,7 @@ from tqdm.auto import trange, tqdm
 from . import utils
 from . import deis
 import comfy.model_patcher
+import comfy.model_sampling

 def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])
@@ -43,6 +44,17 @@ def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    return append_zero(sigmas)


+def get_sigmas_laplace(n, sigma_min, sigma_max, mu=0., beta=0.5, device='cpu'):
+    """Constructs the noise schedule proposed by Tiankai et al. (2024). """
+    epsilon = 1e-5 # avoid log(0)
+    x = torch.linspace(0, 1, n, device=device)
+    clamp = lambda x: torch.clamp(x, min=sigma_min, max=sigma_max)
+    lmb = mu - beta * torch.sign(0.5-x) * torch.log(1 - 2 * torch.abs(0.5-x) + epsilon)
+    sigmas = clamp(torch.exp(lmb))
+    return sigmas
+
+
+
 def to_d(x, sigma, denoised):
    """Converts a denoiser output to a Karras ODE derivative."""
    return (x - denoised) / utils.append_dims(sigma, x.ndim)
@@ -152,6 +164,8 @@ def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None,

@torch.no_grad()
 def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
+        return sample_euler_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
@@ -169,6 +183,29 @@ def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, dis
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x

+@torch.no_grad()
+def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1.0, s_noise=1., noise_sampler=None):
+    """Ancestral sampling with Euler method steps."""
+    extra_args = {} if extra_args is None else extra_args
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        # sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
+        sigma_down = sigmas[i+1] * downstep_ratio
+        alpha_ip1 = 1 - sigmas[i+1]
+        alpha_down = 1 - sigma_down
+        renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+
+        # Euler method
+        sigma_down_i_ratio = sigma_down / sigmas[i]
+        x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * denoised
+        if sigmas[i + 1] > 0 and eta > 0:
+            x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
+    return x

@torch.no_grad()
 def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
@@ -509,6 +546,9 @@ def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callbac

@torch.no_grad()
 def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
+        return sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
+
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
@@ -541,6 +581,55 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
    return x


+@torch.no_grad()
+def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
+    extra_args = {} if extra_args is None else extra_args
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+    sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
+    lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
+
+    # logged_x = x.unsqueeze(0)
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
+        sigma_down = sigmas[i+1] * downstep_ratio
+        alpha_ip1 = 1 - sigmas[i+1]
+        alpha_down = 1 - sigma_down
+        renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
+        # sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        if sigmas[i + 1] == 0:
+            # Euler method
+            d = to_d(x, sigmas[i], denoised)
+            dt = sigma_down - sigmas[i]
+            x = x + d * dt
+        else:
+            # DPM-Solver++(2S)
+            if sigmas[i] == 1.0:
+                sigma_s = 0.9999
+            else:
+                t_i, t_down = lambda_fn(sigmas[i]), lambda_fn(sigma_down)
+                r = 1 / 2
+                h = t_down - t_i
+                s = t_i + r * h
+                sigma_s = sigma_fn(s)
+            # sigma_s = sigmas[i+1]
+            sigma_s_i_ratio = sigma_s / sigmas[i]
+            u = sigma_s_i_ratio * x + (1 - sigma_s_i_ratio) * denoised
+            D_i = model(u, sigma_s * s_in, **extra_args)
+            sigma_down_i_ratio = sigma_down / sigmas[i]
+            x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * D_i
+            # print("sigma_i", sigmas[i], "sigma_ip1", sigmas[i+1],"sigma_down", sigma_down, "sigma_down_i_ratio", sigma_down_i_ratio, "sigma_s_i_ratio", sigma_s_i_ratio, "renoise_coeff", renoise_coeff)
+        # Noise addition
+        if sigmas[i + 1] > 0 and eta > 0:
+            x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
+        # logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
+    return x
+
@torch.no_grad()
 def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
@@ -1016,7 +1105,6 @@ def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disabl
        d = to_d(x, sigma_hat, temp[0])
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
-        dt = sigmas[i + 1] - sigma_hat
        # Euler method
        x = denoised + d * sigmas[i + 1]
    return x
@@ -1043,8 +1131,81 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        d = to_d(x, sigmas[i], temp[0])
        # Euler method
-        dt = sigma_down - sigmas[i]
        x = denoised + d * sigma_down
        if sigmas[i + 1] > 0:
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x
+@torch.no_grad()
+def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
+    extra_args = {} if extra_args is None else extra_args
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+
+    temp = [0]
+    def post_cfg_function(args):
+        temp[0] = args["uncond_denoised"]
+        return args["denoised"]
+
+    model_options = extra_args.get("model_options", {}).copy()
+    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
+
+    s_in = x.new_ones([x.shape[0]])
+    sigma_fn = lambda t: t.neg().exp()
+    t_fn = lambda sigma: sigma.log().neg()
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        if sigma_down == 0:
+            # Euler method
+            d = to_d(x, sigmas[i], temp[0])
+            x = denoised + d * sigma_down
+        else:
+            # DPM-Solver++(2S)
+            t, t_next = t_fn(sigmas[i]), t_fn(sigma_down)
+            # r = torch.sinh(1 + (2 - eta) * (t_next - t) / (t - t_fn(sigma_up))) works only on non-cfgpp, weird
+            r = 1 / 2
+            h = t_next - t
+            s = t + r * h
+            x_2 = (sigma_fn(s) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h * r).expm1() * denoised
+            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
+            x = (sigma_fn(t_next) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h).expm1() * denoised_2
+        # Noise addition
+        if sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+    return x
+
+@torch.no_grad()
+def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
+    """DPM-Solver++(2M)."""
+    extra_args = {} if extra_args is None else extra_args
+    s_in = x.new_ones([x.shape[0]])
+    t_fn = lambda sigma: sigma.log().neg()
+
+    old_uncond_denoised = None
+    uncond_denoised = None
+    def post_cfg_function(args):
+        nonlocal uncond_denoised
+        uncond_denoised = args["uncond_denoised"]
+        return args["denoised"]
+    
+    model_options = extra_args.get("model_options", {}).copy()
+    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
+        h = t_next - t
+        if old_uncond_denoised is None or sigmas[i + 1] == 0:
+            denoised_mix = -torch.exp(-h) * uncond_denoised
+        else:
+            h_last = t - t_fn(sigmas[i - 1])
+            r = h_last / h
+            denoised_mix = -torch.exp(-h) * uncond_denoised - torch.expm1(-h) * (1 / (2 * r)) * (denoised - old_uncond_denoised)
+        x = denoised + denoised_mix + torch.exp(-h) * x
+        old_uncond_denoised = uncond_denoised
+    return x
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -4,6 +4,7 @@ class LatentFormat:
    scale_factor = 1.0
    latent_channels = 4
    latent_rgb_factors = None
+    latent_rgb_factors_bias = None
    taesd_decoder_name = None

    def process_in(self, latent):
@@ -30,11 +31,13 @@ class SDXL(LatentFormat):
    def __init__(self):
        self.latent_rgb_factors = [
                    #   R        G        B
-                    [ 0.3920,  0.4054,  0.4549],
-                    [-0.2634, -0.0196,  0.0653],
-                    [ 0.0568,  0.1687, -0.0755],
-                    [-0.3112, -0.2359, -0.2076]
+                    [ 0.3651,  0.4232,  0.4341],
+                    [-0.2533, -0.0042,  0.1068],
+                    [ 0.1076,  0.1111, -0.0362],
+                    [-0.3165, -0.2492, -0.2188]
                ]
+        self.latent_rgb_factors_bias = [ 0.1084, -0.0175, -0.0011]
+
        self.taesd_decoder_name = "taesdxl_decoder"

 class SDXL_Playground_2_5(LatentFormat):
@@ -112,23 +115,24 @@ class SD3(LatentFormat):
        self.scale_factor = 1.5305
        self.shift_factor = 0.0609
        self.latent_rgb_factors = [
-            [-0.0645,  0.0177,  0.1052],
-            [ 0.0028,  0.0312,  0.0650],
-            [ 0.1848,  0.0762,  0.0360],
-            [ 0.0944,  0.0360,  0.0889],
-            [ 0.0897,  0.0506, -0.0364],
-            [-0.0020,  0.1203,  0.0284],
-            [ 0.0855,  0.0118,  0.0283],
-            [-0.0539,  0.0658,  0.1047],
-            [-0.0057,  0.0116,  0.0700],
-            [-0.0412,  0.0281, -0.0039],
-            [ 0.1106,  0.1171,  0.1220],
-            [-0.0248,  0.0682, -0.0481],
-            [ 0.0815,  0.0846,  0.1207],
-            [-0.0120, -0.0055, -0.0867],
-            [-0.0749, -0.0634, -0.0456],
-            [-0.1418, -0.1457, -0.1259]
+            [-0.0922, -0.0175,  0.0749],
+            [ 0.0311,  0.0633,  0.0954],
+            [ 0.1994,  0.0927,  0.0458],
+            [ 0.0856,  0.0339,  0.0902],
+            [ 0.0587,  0.0272, -0.0496],
+            [-0.0006,  0.1104,  0.0309],
+            [ 0.0978,  0.0306,  0.0427],
+            [-0.0042,  0.1038,  0.1358],
+            [-0.0194,  0.0020,  0.0669],
+            [-0.0488,  0.0130, -0.0268],
+            [ 0.0922,  0.0988,  0.0951],
+            [-0.0278,  0.0524, -0.0542],
+            [ 0.0332,  0.0456,  0.0895],
+            [-0.0069, -0.0030, -0.0810],
+            [-0.0596, -0.0465, -0.0293],
+            [-0.1448, -0.1463, -0.1189]
        ]
+        self.latent_rgb_factors_bias = [0.2394, 0.2135, 0.1925]
        self.taesd_decoder_name = "taesd3_decoder"

    def process_in(self, latent):
@@ -139,3 +143,80 @@ class SD3(LatentFormat):

 class StableAudio1(LatentFormat):
    latent_channels = 64
+
+class Flux(SD3):
+    latent_channels = 16
+    def __init__(self):
+        self.scale_factor = 0.3611
+        self.shift_factor = 0.1159
+        self.latent_rgb_factors =[
+            [-0.0346,  0.0244,  0.0681],
+            [ 0.0034,  0.0210,  0.0687],
+            [ 0.0275, -0.0668, -0.0433],
+            [-0.0174,  0.0160,  0.0617],
+            [ 0.0859,  0.0721,  0.0329],
+            [ 0.0004,  0.0383,  0.0115],
+            [ 0.0405,  0.0861,  0.0915],
+            [-0.0236, -0.0185, -0.0259],
+            [-0.0245,  0.0250,  0.1180],
+            [ 0.1008,  0.0755, -0.0421],
+            [-0.0515,  0.0201,  0.0011],
+            [ 0.0428, -0.0012, -0.0036],
+            [ 0.0817,  0.0765,  0.0749],
+            [-0.1264, -0.0522, -0.1103],
+            [-0.0280, -0.0881, -0.0499],
+            [-0.1262, -0.0982, -0.0778]
+        ]
+        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
+        self.taesd_decoder_name = "taef1_decoder"
+
+    def process_in(self, latent):
+        return (latent - self.shift_factor) * self.scale_factor
+
+    def process_out(self, latent):
+        return (latent / self.scale_factor) + self.shift_factor
+
+class Mochi(LatentFormat):
+    latent_channels = 12
+
+    def __init__(self):
+        self.scale_factor = 1.0
+        self.latents_mean = torch.tensor([-0.06730895953510081, -0.038011381506090416, -0.07477820912866141,
+                                          -0.05565264470995561, 0.012767231469026969, -0.04703542746246419,
+                                          0.043896967884726704, -0.09346305707025976, -0.09918314763016893,
+                                          -0.008729793427399178, -0.011931556316503654, -0.0321993391887285]).view(1, self.latent_channels, 1, 1, 1)
+        self.latents_std = torch.tensor([0.9263795028493863, 0.9248894543193766, 0.9393059390890617,
+                                         0.959253732819592, 0.8244560132752793, 0.917259975397747,
+                                         0.9294154431013696, 1.3720942357788521, 0.881393668867029,
+                                         0.9168315692124348, 0.9185249279345552, 0.9274757570805041]).view(1, self.latent_channels, 1, 1, 1)
+
+        self.latent_rgb_factors =[
+            [-0.0069, -0.0045,  0.0018],
+            [ 0.0154, -0.0692, -0.0274],
+            [ 0.0333,  0.0019,  0.0206],
+            [-0.1390,  0.0628,  0.1678],
+            [-0.0725,  0.0134, -0.1898],
+            [ 0.0074, -0.0270, -0.0209],
+            [-0.0176, -0.0277, -0.0221],
+            [ 0.5294,  0.5204,  0.3852],
+            [-0.0326, -0.0446, -0.0143],
+            [-0.0659,  0.0153, -0.0153],
+            [ 0.0185, -0.0217,  0.0014],
+            [-0.0396, -0.0495, -0.0281]
+        ]
+        self.latent_rgb_factors_bias = [-0.0940, -0.1418, -0.1453]
+        self.taesd_decoder_name = None #TODO
+
+    def process_in(self, latent):
+        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
+        latents_std = self.latents_std.to(latent.device, latent.dtype)
+        return (latent - latents_mean) * self.scale_factor / latents_std
+
+    def process_out(self, latent):
+        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
+        latents_std = self.latents_std.to(latent.device, latent.dtype)
+        return latent * latents_std / self.scale_factor + latents_mean
+
+class LTXV(LatentFormat):
+    latent_channels = 128
+
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@@ -9,6 +9,7 @@ from einops import rearrange
 from torch import nn
 from torch.nn import functional as F
 import math
+import comfy.ops

 class FourierFeatures(nn.Module):
    def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -18,7 +19,7 @@ class FourierFeatures(nn.Module):
            [out_features // 2, in_features], dtype=dtype, device=device))

    def forward(self, input):
-        f = 2 * math.pi * input @ self.weight.T.to(dtype=input.dtype, device=input.device)
+        f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
        return torch.cat([f.cos(), f.sin()], dim=-1)

 # norms
@@ -38,9 +39,9 @@ class LayerNorm(nn.Module):

    def forward(self, x):
        beta = self.beta
-        if self.beta is not None:
-            beta = beta.to(dtype=x.dtype, device=x.device)
-        return F.layer_norm(x, x.shape[-1:], weight=self.gamma.to(dtype=x.dtype, device=x.device), bias=beta)
+        if beta is not None:
+            beta = comfy.ops.cast_to_input(beta, x)
+        return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)

 class GLU(nn.Module):
    def __init__(
@@ -123,7 +124,9 @@ class RotaryEmbedding(nn.Module):
        scale_base = 512,
        interpolation_factor = 1.,
        base = 10000,
-        base_rescale_factor = 1.
+        base_rescale_factor = 1.,
+        dtype=None,
+        device=None,
    ):
        super().__init__()
        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
@@ -131,8 +134,8 @@ class RotaryEmbedding(nn.Module):
        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
        base *= base_rescale_factor ** (dim / (dim - 2))

-        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq)
+        # inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer('inv_freq', torch.empty((dim // 2,), device=device, dtype=dtype))

        assert interpolation_factor >= 1.
        self.interpolation_factor = interpolation_factor
@@ -161,14 +164,14 @@ class RotaryEmbedding(nn.Module):

        t = t / self.interpolation_factor

-        freqs = torch.einsum('i , j -> i j', t, self.inv_freq.to(dtype=dtype, device=device))
+        freqs = torch.einsum('i , j -> i j', t, comfy.ops.cast_to_input(self.inv_freq, t))
        freqs = torch.cat((freqs, freqs), dim = -1)

        if self.scale is None:
            return freqs, 1.

        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
-        scale = self.scale.to(dtype=dtype, device=device) ** rearrange(power, 'n -> n 1')
+        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
        scale = torch.cat((scale, scale), dim = -1)

        return freqs, scale
@@ -568,7 +571,7 @@ class ContinuousTransformer(nn.Module):
        self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()

        if rotary_pos_emb:
-            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
+            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32), device=device, dtype=dtype)
        else:
            self.rotary_pos_emb = None

@@ -609,7 +612,9 @@ class ContinuousTransformer(nn.Module):
        return_info = False,
        **kwargs
    ):
+        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
        batch, seq, device = *x.shape[:2], x.device
+        context = kwargs["context"]

        info = {
            "hidden_states": [],
@@ -640,9 +645,19 @@ class ContinuousTransformer(nn.Module):
        if self.use_sinusoidal_emb or self.use_abs_pos_emb:
            x = x + self.pos_emb(x)

+        blocks_replace = patches_replace.get("dit", {})
        # Iterate over the transformer layers
-        for layer in self.layers:
-            x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
+        for i, layer in enumerate(self.layers):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
            # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

            if return_info:
@@ -871,7 +886,6 @@ class AudioDiffusionTransformer(nn.Module):
        mask=None,
        return_info=False,
        control=None,
-        transformer_options={},
        **kwargs):
            return self._forward(
                x,
--- a/comfy/ldm/aura/mmdit.py
+++ b/comfy/ldm/aura/mmdit.py
@@ -8,6 +8,8 @@ import torch.nn as nn
 import torch.nn.functional as F

 from comfy.ldm.modules.attention import optimized_attention
+import comfy.ops
+import comfy.ldm.common_dit

 def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -406,10 +408,7 @@ class MMDiT(nn.Module):

    def patchify(self, x):
        B, C, H, W = x.size()
-        pad_h = (self.patch_size - H % self.patch_size) % self.patch_size
-        pad_w = (self.patch_size - W % self.patch_size) % self.patch_size
-
-        x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
        x = x.view(
            B,
            C,
@@ -427,7 +426,7 @@ class MMDiT(nn.Module):
        max_dim = max(h, w)

        cur_dim = self.h_max
-        pos_encoding = self.positional_encoding.reshape(1, cur_dim, cur_dim, -1).to(device=x.device, dtype=x.dtype)
+        pos_encoding = comfy.ops.cast_to_input(self.positional_encoding.reshape(1, cur_dim, cur_dim, -1), x)

        if max_dim > cur_dim:
            pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1)
@@ -438,7 +437,8 @@ class MMDiT(nn.Module):
        pos_encoding = pos_encoding[:,from_h:from_h+h,from_w:from_w+w]
        return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1])

-    def forward(self, x, timestep, context, **kwargs):
+    def forward(self, x, timestep, context, transformer_options={}, **kwargs):
+        patches_replace = transformer_options.get("patches_replace", {})
        # patchify x, add PE
        b, c, h, w = x.shape

@@ -455,19 +455,40 @@ class MMDiT(nn.Module):
        t = timestep

        c = self.cond_seq_linear(c_seq)  # B, T_c, D
-        c = torch.cat([self.register_tokens.to(device=c.device, dtype=c.dtype).repeat(c.size(0), 1, 1), c], dim=1)
+        c = torch.cat([comfy.ops.cast_to_input(self.register_tokens, c).repeat(c.size(0), 1, 1), c], dim=1)

        global_cond = self.t_embedder(t, x.dtype)  # B, D

+        blocks_replace = patches_replace.get("dit", {})
        if len(self.double_layers) > 0:
-            for layer in self.double_layers:
-                c, x = layer(c, x, global_cond, **kwargs)
+            for i, layer in enumerate(self.double_layers):
+                if ("double_block", i) in blocks_replace:
+                    def block_wrap(args):
+                        out = {}
+                        out["txt"], out["img"] = layer(args["txt"],
+                                                       args["img"],
+                                                       args["vec"])
+                        return out
+                    out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond}, {"original_block": block_wrap})
+                    c = out["txt"]
+                    x = out["img"]
+                else:
+                    c, x = layer(c, x, global_cond, **kwargs)

        if len(self.single_layers) > 0:
            c_len = c.size(1)
            cx = torch.cat([c, x], dim=1)
-            for layer in self.single_layers:
-                cx = layer(cx, global_cond, **kwargs)
+            for i, layer in enumerate(self.single_layers):
+                if ("single_block", i) in blocks_replace:
+                    def block_wrap(args):
+                        out = {}
+                        out["img"] = layer(args["img"], args["vec"])
+                        return out
+
+                    out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond}, {"original_block": block_wrap})
+                    cx = out["img"]
+                else:
+                    cx = layer(cx, global_cond, **kwargs)

            x = cx[:, c_len:]

--- a/comfy/ldm/cascade/common.py
+++ b/comfy/ldm/cascade/common.py
@@ -19,14 +19,7 @@
 import torch
 import torch.nn as nn
 from comfy.ldm.modules.attention import optimized_attention
-
-class Linear(torch.nn.Linear):
-    def reset_parameters(self):
-        return None
-
-class Conv2d(torch.nn.Conv2d):
-    def reset_parameters(self):
-        return None
+import comfy.ops

 class OptimizedAttention(nn.Module):
    def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
@@ -78,13 +71,13 @@ class GlobalResponseNorm(nn.Module):
    "from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105"
    def __init__(self, dim, dtype=None, device=None):
        super().__init__()
-        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
-        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
+        self.gamma = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))
+        self.beta = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))

    def forward(self, x):
        Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
-        return self.gamma.to(device=x.device, dtype=x.dtype) * (x * Nx) + self.beta.to(device=x.device, dtype=x.dtype) + x
+        return comfy.ops.cast_to_input(self.gamma, x) * (x * Nx) + comfy.ops.cast_to_input(self.beta, x) + x


 class ResBlock(nn.Module):
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@@ -0,0 +1,27 @@
+import torch
+import comfy.ops
+
+def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
+    if padding_mode == "circular" and torch.jit.is_tracing() or torch.jit.is_scripting():
+        padding_mode = "reflect"
+    pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
+    pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
+    return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)
+
+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+except:
+    rms_norm_torch = None
+
+def rms_norm(x, weight=None, eps=1e-6):
+    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
+        if weight is None:
+            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
+        else:
+            return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    else:
+        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+        if weight is None:
+            return r
+        else:
+            return r * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@@ -0,0 +1,205 @@
+#Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
+#modified to support different types of flux controlnets
+
+import torch
+import math
+from torch import Tensor, nn
+from einops import rearrange, repeat
+
+from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
+                                 MLPEmbedder, SingleStreamBlock,
+                                 timestep_embedding)
+
+from .model import Flux
+import comfy.ldm.common_dit
+
+class MistolineCondDownsamplBlock(nn.Module):
+    def __init__(self, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
+        )
+
+    def forward(self, x):
+        return self.encoder(x)
+
+class MistolineControlnetBlock(nn.Module):
+    def __init__(self, hidden_size, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.linear = operations.Linear(hidden_size, hidden_size, dtype=dtype, device=device)
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        return self.act(self.linear(x))
+
+
+class ControlNetFlux(Flux):
+    def __init__(self, latent_input=False, num_union_modes=0, mistoline=False, control_latent_channels=None, image_model=None, dtype=None, device=None, operations=None, **kwargs):
+        super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)
+
+        self.main_model_double = 19
+        self.main_model_single = 38
+
+        self.mistoline = mistoline
+        # add ControlNet blocks
+        if self.mistoline:
+            control_block = lambda : MistolineControlnetBlock(self.hidden_size, dtype=dtype, device=device, operations=operations)
+        else:
+            control_block = lambda : operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
+
+        self.controlnet_blocks = nn.ModuleList([])
+        for _ in range(self.params.depth):
+            self.controlnet_blocks.append(control_block())
+
+        self.controlnet_single_blocks = nn.ModuleList([])
+        for _ in range(self.params.depth_single_blocks):
+            self.controlnet_single_blocks.append(control_block())
+
+        self.num_union_modes = num_union_modes
+        self.controlnet_mode_embedder = None
+        if self.num_union_modes > 0:
+            self.controlnet_mode_embedder = operations.Embedding(self.num_union_modes, self.hidden_size, dtype=dtype, device=device)
+
+        self.gradient_checkpointing = False
+        self.latent_input = latent_input
+        if control_latent_channels is None:
+            control_latent_channels = self.in_channels
+        else:
+            control_latent_channels *= 2 * 2 #patch size
+
+        self.pos_embed_input = operations.Linear(control_latent_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
+        if not self.latent_input:
+            if self.mistoline:
+                self.input_cond_block = MistolineCondDownsamplBlock(dtype=dtype, device=device, operations=operations)
+            else:
+                self.input_hint_block = nn.Sequential(
+                    operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
+                    nn.SiLU(),
+                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
+                )
+
+    def forward_orig(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        controlnet_cond: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y: Tensor,
+        guidance: Tensor = None,
+        control_type: Tensor = None,
+    ) -> Tensor:
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")
+
+        # running on sequences img
+        img = self.img_in(img)
+
+        controlnet_cond = self.pos_embed_input(controlnet_cond)
+        img = img + controlnet_cond
+        vec = self.time_in(timestep_embedding(timesteps, 256))
+        if self.params.guidance_embed:
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+        vec = vec + self.vector_in(y)
+        txt = self.txt_in(txt)
+
+        if self.controlnet_mode_embedder is not None and len(control_type) > 0:
+            control_cond = self.controlnet_mode_embedder(torch.tensor(control_type, device=img.device), out_dtype=img.dtype).unsqueeze(0).repeat((txt.shape[0], 1, 1))
+            txt = torch.cat([control_cond, txt], dim=1)
+            txt_ids = torch.cat([txt_ids[:,:1], txt_ids], dim=1)
+
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)
+
+        controlnet_double = ()
+
+        for i in range(len(self.double_blocks)):
+            img, txt = self.double_blocks[i](img=img, txt=txt, vec=vec, pe=pe)
+            controlnet_double = controlnet_double + (self.controlnet_blocks[i](img),)
+
+        img = torch.cat((txt, img), 1)
+
+        controlnet_single = ()
+
+        for i in range(len(self.single_blocks)):
+            img = self.single_blocks[i](img, vec=vec, pe=pe)
+            controlnet_single = controlnet_single + (self.controlnet_single_blocks[i](img[:, txt.shape[1] :, ...]),)
+
+        repeat = math.ceil(self.main_model_double / len(controlnet_double))
+        if self.latent_input:
+            out_input = ()
+            for x in controlnet_double:
+                    out_input += (x,) * repeat
+        else:
+            out_input = (controlnet_double * repeat)
+
+        out = {"input": out_input[:self.main_model_double]}
+        if len(controlnet_single) > 0:
+            repeat = math.ceil(self.main_model_single / len(controlnet_single))
+            out_output = ()
+            if self.latent_input:
+                for x in controlnet_single:
+                        out_output += (x,) * repeat
+            else:
+                out_output = (controlnet_single * repeat)
+            out["output"] = out_output[:self.main_model_single]
+        return out
+
+    def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
+        patch_size = 2
+        if self.latent_input:
+            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
+        elif self.mistoline:
+            hint = hint * 2.0 - 1.0
+            hint = self.input_cond_block(hint)
+        else:
+            hint = hint * 2.0 - 1.0
+            hint = self.input_hint_block(hint)
+
+        hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+
+        bs, c, h, w = x.shape
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
+
+        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+
+        h_len = ((h + (patch_size // 2)) // patch_size)
+        w_len = ((w + (patch_size // 2)) // patch_size)
+        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
+        img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
+        img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
+        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
+
+        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        return self.forward_orig(img, img_ids, hint, context, txt_ids, timesteps, y, guidance, control_type=kwargs.get("control_type", []))
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -0,0 +1,249 @@
+import math
+from dataclasses import dataclass
+
+import torch
+from torch import Tensor, nn
+
+from .math import attention, rope
+import comfy.ops
+import comfy.ldm.common_dit
+
+
+class EmbedND(nn.Module):
+    def __init__(self, dim: int, theta: int, axes_dim: list):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: Tensor) -> Tensor:
+        n_axes = ids.shape[-1]
+        emb = torch.cat(
+            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+            dim=-3,
+        )
+
+        return emb.unsqueeze(1)
+
+
+def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
+    """
+    Create sinusoidal timestep embeddings.
+    :param t: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an (N, D) Tensor of positional embeddings.
+    """
+    t = time_factor * t
+    half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
+
+    args = t[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    if torch.is_floating_point(t):
+        embedding = embedding.to(t)
+    return embedding
+
+class MLPEmbedder(nn.Module):
+    def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.silu = nn.SiLU()
+        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
+
+    def forward(self, x: Tensor):
+        return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
+
+
+class QKNorm(torch.nn.Module):
+    def __init__(self, dim: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
+        self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
+        q = self.query_norm(q)
+        k = self.key_norm(k)
+        return q.to(v), k.to(v)
+
+
+class SelfAttention(nn.Module):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
+        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
+        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+
+
+@dataclass
+class ModulationOut:
+    shift: Tensor
+    scale: Tensor
+    gate: Tensor
+
+
+class Modulation(nn.Module):
+    def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.is_double = double
+        self.multiplier = 6 if double else 3
+        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
+
+    def forward(self, vec: Tensor) -> tuple:
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
+
+        return (
+            ModulationOut(*out[:3]),
+            ModulationOut(*out[3:]) if self.is_double else None,
+        )
+
+
+class DoubleStreamBlock(nn.Module):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
+        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+
+        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.img_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+        self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
+        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+
+        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.txt_mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.GELU(approximate="tanh"),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
+        img_mod1, img_mod2 = self.img_mod(vec)
+        txt_mod1, txt_mod2 = self.txt_mod(vec)
+
+        # prepare image for attention
+        img_modulated = self.img_norm1(img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+        img_qkv = self.img_attn.qkv(img_modulated)
+        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
+
+        # prepare txt for attention
+        txt_modulated = self.txt_norm1(txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        txt_qkv = self.txt_attn.qkv(txt_modulated)
+        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
+
+        # run actual attention
+        attn = attention(torch.cat((txt_q, img_q), dim=2),
+                         torch.cat((txt_k, img_k), dim=2),
+                         torch.cat((txt_v, img_v), dim=2), pe=pe)
+
+        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
+
+        # calculate the img bloks
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
+
+        # calculate the txt bloks
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
+
+        if txt.dtype == torch.float16:
+            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
+
+        return img, txt
+
+
+class SingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qk_scale: float = None,
+        dtype=None,
+        device=None,
+        operations=None
+    ):
+        super().__init__()
+        self.hidden_dim = hidden_size
+        self.num_heads = num_heads
+        head_dim = hidden_size // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        # qkv and mlp_in
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
+        # proj and mlp_out
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
+
+        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
+
+        self.hidden_size = hidden_size
+        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+
+        self.mlp_act = nn.GELU(approximate="tanh")
+        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
+        mod, _ = self.modulation(vec)
+        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+
+        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k = self.norm(q, k, v)
+
+        # compute attention
+        attn = attention(q, k, v, pe=pe)
+        # compute activation in mlp stream, cat again and run second linear layer
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        x += mod.gate * output
+        if x.dtype == torch.float16:
+            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
+        return x
+
+
+class LastLayer(nn.Module):
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
+
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
+        x = self.linear(x)
+        return x
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -0,0 +1,35 @@
+import torch
+from einops import rearrange
+from torch import Tensor
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.model_management
+
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
+    q, k = apply_rope(q, k, pe)
+
+    heads = q.shape[1]
+    x = optimized_attention(q, k, v, heads, skip_reshape=True)
+    return x
+
+
+def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
+    assert dim % 2 == 0
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
+        device = torch.device("cpu")
+    else:
+        device = pos.device
+
+    scale = torch.linspace(0, (dim - 2) / dim, steps=dim//2, dtype=torch.float64, device=device)
+    omega = 1.0 / (theta**scale)
+    out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega)
+    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
+    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
+    return out.to(dtype=torch.float32, device=pos.device)
+
+
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -0,0 +1,185 @@
+#Original code can be found on: https://github.com/black-forest-labs/flux
+
+from dataclasses import dataclass
+
+import torch
+from torch import Tensor, nn
+
+from .layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+)
+
+from einops import rearrange, repeat
+import comfy.ldm.common_dit
+
+@dataclass
+class FluxParams:
+    in_channels: int
+    out_channels: int
+    vec_in_dim: int
+    context_in_dim: int
+    hidden_size: int
+    mlp_ratio: float
+    num_heads: int
+    depth: int
+    depth_single_blocks: int
+    axes_dim: list
+    theta: int
+    patch_size: int
+    qkv_bias: bool
+    guidance_embed: bool
+
+
+class Flux(nn.Module):
+    """
+    Transformer model for flow matching on sequences.
+    """
+
+    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        self.dtype = dtype
+        params = FluxParams(**kwargs)
+        self.params = params
+        self.patch_size = params.patch_size
+        self.in_channels = params.in_channels * params.patch_size * params.patch_size
+        self.out_channels = params.out_channels * params.patch_size * params.patch_size
+        if params.hidden_size % params.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
+            )
+        pe_dim = params.hidden_size // params.num_heads
+        if sum(params.axes_dim) != pe_dim:
+            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
+        self.hidden_size = params.hidden_size
+        self.num_heads = params.num_heads
+        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
+        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+        )
+        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
+
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                    qkv_bias=params.qkv_bias,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for _ in range(params.depth)
+            ]
+        )
+
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                for _ in range(params.depth_single_blocks)
+            ]
+        )
+
+        if final_layer:
+            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
+
+    def forward_orig(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y: Tensor,
+        guidance: Tensor = None,
+        control=None,
+        transformer_options={},
+    ) -> Tensor:
+        patches_replace = transformer_options.get("patches_replace", {})
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")
+
+        # running on sequences img
+        img = self.img_in(img)
+        vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
+        if self.params.guidance_embed:
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+
+        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
+        txt = self.txt_in(txt)
+
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)
+
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.double_blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe}, {"original_block": block_wrap})
+                txt = out["txt"]
+                img = out["img"]
+            else:
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
+
+            if control is not None: # Controlnet
+                control_i = control.get("input")
+                if i < len(control_i):
+                    add = control_i[i]
+                    if add is not None:
+                        img += add
+
+        img = torch.cat((txt, img), 1)
+
+        for i, block in enumerate(self.single_blocks):
+            if ("single_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"])
+                    return out
+
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe}, {"original_block": block_wrap})
+                img = out["img"]
+            else:
+                img = block(img, vec=vec, pe=pe)
+
+            if control is not None: # Controlnet
+                control_o = control.get("output")
+                if i < len(control_o):
+                    add = control_o[i]
+                    if add is not None:
+                        img[:, txt.shape[1] :, ...] += add
+
+        img = img[:, txt.shape[1] :, ...]
+
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        return img
+
+    def forward(self, x, timestep, context, y, guidance, control=None, transformer_options={}, **kwargs):
+        bs, c, h, w = x.shape
+        patch_size = self.patch_size
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
+
+        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+
+        h_len = ((h + (patch_size // 2)) // patch_size)
+        w_len = ((w + (patch_size // 2)) // patch_size)
+        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
+
+        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options)
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
--- a/comfy/ldm/flux/redux.py
+++ b/comfy/ldm/flux/redux.py
@@ -0,0 +1,25 @@
+import torch
+import comfy.ops
+
+ops = comfy.ops.manual_cast
+
+class ReduxImageEncoder(torch.nn.Module):
+    def __init__(
+        self,
+        redux_dim: int = 1152,
+        txt_in_features: int = 4096,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__()
+
+        self.redux_dim = redux_dim
+        self.device = device
+        self.dtype = dtype
+
+        self.redux_up = ops.Linear(redux_dim, txt_in_features * 3, dtype=dtype)
+        self.redux_down = ops.Linear(txt_in_features * 3, txt_in_features, dtype=dtype)
+
+    def forward(self, sigclip_embeds) -> torch.Tensor:
+        projected_x = self.redux_down(torch.nn.functional.silu(self.redux_up(sigclip_embeds)))
+        return projected_x
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@@ -0,0 +1,559 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+#adapted to ComfyUI
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+# from flash_attn import flash_attn_varlen_qkvpacked_func
+from comfy.ldm.modules.attention import optimized_attention
+
+from .layers import (
+    FeedForward,
+    PatchEmbed,
+    RMSNorm,
+    TimestepEmbedder,
+)
+
+from .rope_mixed import (
+    compute_mixed_rotation,
+    create_position_matrix,
+)
+from .temporal_rope import apply_rotary_emb_qk_real
+from .utils import (
+    AttentionPool,
+    modulate,
+)
+
+import comfy.ldm.common_dit
+import comfy.ops
+
+
+def modulated_rmsnorm(x, scale, eps=1e-6):
+    # Normalize and modulate
+    x_normed = comfy.ldm.common_dit.rms_norm(x, eps=eps)
+    x_modulated = x_normed * (1 + scale.unsqueeze(1))
+
+    return x_modulated
+
+
+def residual_tanh_gated_rmsnorm(x, x_res, gate, eps=1e-6):
+    # Apply tanh to gate
+    tanh_gate = torch.tanh(gate).unsqueeze(1)
+
+    # Normalize and apply gated scaling
+    x_normed = comfy.ldm.common_dit.rms_norm(x_res, eps=eps) * tanh_gate
+
+    # Apply residual connection
+    output = x + x_normed
+
+    return output
+
+class AsymmetricAttention(nn.Module):
+    def __init__(
+        self,
+        dim_x: int,
+        dim_y: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        attn_drop: float = 0.0,
+        update_y: bool = True,
+        out_bias: bool = True,
+        attend_to_padding: bool = False,
+        softmax_scale: Optional[float] = None,
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+    ):
+        super().__init__()
+        self.dim_x = dim_x
+        self.dim_y = dim_y
+        self.num_heads = num_heads
+        self.head_dim = dim_x // num_heads
+        self.attn_drop = attn_drop
+        self.update_y = update_y
+        self.attend_to_padding = attend_to_padding
+        self.softmax_scale = softmax_scale
+        if dim_x % num_heads != 0:
+            raise ValueError(
+                f"dim_x={dim_x} should be divisible by num_heads={num_heads}"
+            )
+
+        # Input layers.
+        self.qkv_bias = qkv_bias
+        self.qkv_x = operations.Linear(dim_x, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)
+        # Project text features to match visual features (dim_y -> dim_x)
+        self.qkv_y = operations.Linear(dim_y, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)
+
+        # Query and key normalization for stability.
+        assert qk_norm
+        self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
+
+        # Output layers. y features go back down from dim_x -> dim_y.
+        self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)
+        self.proj_y = (
+            operations.Linear(dim_x, dim_y, bias=out_bias, device=device, dtype=dtype)
+            if update_y
+            else nn.Identity()
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,  # (B, N, dim_x)
+        y: torch.Tensor,  # (B, L, dim_y)
+        scale_x: torch.Tensor,  # (B, dim_x), modulation for pre-RMSNorm.
+        scale_y: torch.Tensor,  # (B, dim_y), modulation for pre-RMSNorm.
+        crop_y,
+        **rope_rotation,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        rope_cos = rope_rotation.get("rope_cos")
+        rope_sin = rope_rotation.get("rope_sin")
+        # Pre-norm for visual features
+        x = modulated_rmsnorm(x, scale_x)  # (B, M, dim_x) where M = N / cp_group_size
+
+        # Process visual features
+        # qkv_x = self.qkv_x(x)  # (B, M, 3 * dim_x)
+        # assert qkv_x.dtype == torch.bfloat16
+        # qkv_x = all_to_all_collect_tokens(
+        #     qkv_x, self.num_heads
+        # )  # (3, B, N, local_h, head_dim)
+
+        # Process text features
+        y = modulated_rmsnorm(y, scale_y)  # (B, L, dim_y)
+        q_y, k_y, v_y = self.qkv_y(y).view(y.shape[0], y.shape[1], 3, self.num_heads, -1).unbind(2)  # (B, N, local_h, head_dim)
+
+        q_y = self.q_norm_y(q_y)
+        k_y = self.k_norm_y(k_y)
+
+        # Split qkv_x into q, k, v
+        q_x, k_x, v_x = self.qkv_x(x).view(x.shape[0], x.shape[1], 3, self.num_heads, -1).unbind(2)  # (B, N, local_h, head_dim)
+        q_x = self.q_norm_x(q_x)
+        q_x = apply_rotary_emb_qk_real(q_x, rope_cos, rope_sin)
+        k_x = self.k_norm_x(k_x)
+        k_x = apply_rotary_emb_qk_real(k_x, rope_cos, rope_sin)
+
+        q = torch.cat([q_x, q_y[:, :crop_y]], dim=1).transpose(1, 2)
+        k = torch.cat([k_x, k_y[:, :crop_y]], dim=1).transpose(1, 2)
+        v = torch.cat([v_x, v_y[:, :crop_y]], dim=1).transpose(1, 2)
+
+        xy = optimized_attention(q,
+                                 k,
+                                 v, self.num_heads, skip_reshape=True)
+
+        x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1)
+        x = self.proj_x(x)
+        o = torch.zeros(y.shape[0], q_y.shape[1], y.shape[-1], device=y.device, dtype=y.dtype)
+        o[:, :y.shape[1]] = y
+
+        y = self.proj_y(o)
+        # print("ox", x)
+        # print("oy", y)
+        return x, y
+
+
+class AsymmetricJointBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size_x: int,
+        hidden_size_y: int,
+        num_heads: int,
+        *,
+        mlp_ratio_x: float = 8.0,  # Ratio of hidden size to d_model for MLP for visual tokens.
+        mlp_ratio_y: float = 4.0,  # Ratio of hidden size to d_model for MLP for text tokens.
+        update_y: bool = True,  # Whether to update text tokens in this block.
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+        **block_kwargs,
+    ):
+        super().__init__()
+        self.update_y = update_y
+        self.hidden_size_x = hidden_size_x
+        self.hidden_size_y = hidden_size_y
+        self.mod_x = operations.Linear(hidden_size_x, 4 * hidden_size_x, device=device, dtype=dtype)
+        if self.update_y:
+            self.mod_y = operations.Linear(hidden_size_x, 4 * hidden_size_y, device=device, dtype=dtype)
+        else:
+            self.mod_y = operations.Linear(hidden_size_x, hidden_size_y, device=device, dtype=dtype)
+
+        # Self-attention:
+        self.attn = AsymmetricAttention(
+            hidden_size_x,
+            hidden_size_y,
+            num_heads=num_heads,
+            update_y=update_y,
+            device=device,
+            dtype=dtype,
+            operations=operations,
+            **block_kwargs,
+        )
+
+        # MLP.
+        mlp_hidden_dim_x = int(hidden_size_x * mlp_ratio_x)
+        assert mlp_hidden_dim_x == int(1536 * 8)
+        self.mlp_x = FeedForward(
+            in_features=hidden_size_x,
+            hidden_size=mlp_hidden_dim_x,
+            multiple_of=256,
+            ffn_dim_multiplier=None,
+            device=device,
+            dtype=dtype,
+            operations=operations,
+        )
+
+        # MLP for text not needed in last block.
+        if self.update_y:
+            mlp_hidden_dim_y = int(hidden_size_y * mlp_ratio_y)
+            self.mlp_y = FeedForward(
+                in_features=hidden_size_y,
+                hidden_size=mlp_hidden_dim_y,
+                multiple_of=256,
+                ffn_dim_multiplier=None,
+                device=device,
+                dtype=dtype,
+                operations=operations,
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,
+        y: torch.Tensor,
+        **attn_kwargs,
+    ):
+        """Forward pass of a block.
+
+        Args:
+            x: (B, N, dim) tensor of visual tokens
+            c: (B, dim) tensor of conditioned features
+            y: (B, L, dim) tensor of text tokens
+            num_frames: Number of frames in the video. N = num_frames * num_spatial_tokens
+
+        Returns:
+            x: (B, N, dim) tensor of visual tokens after block
+            y: (B, L, dim) tensor of text tokens after block
+        """
+        N = x.size(1)
+
+        c = F.silu(c)
+        mod_x = self.mod_x(c)
+        scale_msa_x, gate_msa_x, scale_mlp_x, gate_mlp_x = mod_x.chunk(4, dim=1)
+
+        mod_y = self.mod_y(c)
+        if self.update_y:
+            scale_msa_y, gate_msa_y, scale_mlp_y, gate_mlp_y = mod_y.chunk(4, dim=1)
+        else:
+            scale_msa_y = mod_y
+
+        # Self-attention block.
+        x_attn, y_attn = self.attn(
+            x,
+            y,
+            scale_x=scale_msa_x,
+            scale_y=scale_msa_y,
+            **attn_kwargs,
+        )
+
+        assert x_attn.size(1) == N
+        x = residual_tanh_gated_rmsnorm(x, x_attn, gate_msa_x)
+        if self.update_y:
+            y = residual_tanh_gated_rmsnorm(y, y_attn, gate_msa_y)
+
+        # MLP block.
+        x = self.ff_block_x(x, scale_mlp_x, gate_mlp_x)
+        if self.update_y:
+            y = self.ff_block_y(y, scale_mlp_y, gate_mlp_y)
+
+        return x, y
+
+    def ff_block_x(self, x, scale_x, gate_x):
+        x_mod = modulated_rmsnorm(x, scale_x)
+        x_res = self.mlp_x(x_mod)
+        x = residual_tanh_gated_rmsnorm(x, x_res, gate_x)  # Sandwich norm
+        return x
+
+    def ff_block_y(self, y, scale_y, gate_y):
+        y_mod = modulated_rmsnorm(y, scale_y)
+        y_res = self.mlp_y(y_mod)
+        y = residual_tanh_gated_rmsnorm(y, y_res, gate_y)  # Sandwich norm
+        return y
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of DiT.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        patch_size,
+        out_channels,
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+    ):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(
+            hidden_size, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype
+        )
+        self.mod = operations.Linear(hidden_size, 2 * hidden_size, device=device, dtype=dtype)
+        self.linear = operations.Linear(
+            hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype
+        )
+
+    def forward(self, x, c):
+        c = F.silu(c)
+        shift, scale = self.mod(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+
+class AsymmDiTJoint(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+
+    Ingests text embeddings instead of a label.
+    """
+
+    def __init__(
+        self,
+        *,
+        patch_size=2,
+        in_channels=4,
+        hidden_size_x=1152,
+        hidden_size_y=1152,
+        depth=48,
+        num_heads=16,
+        mlp_ratio_x=8.0,
+        mlp_ratio_y=4.0,
+        use_t5: bool = False,
+        t5_feat_dim: int = 4096,
+        t5_token_length: int = 256,
+        learn_sigma=True,
+        patch_embed_bias: bool = True,
+        timestep_mlp_bias: bool = True,
+        attend_to_padding: bool = False,
+        timestep_scale: Optional[float] = None,
+        use_extended_posenc: bool = False,
+        posenc_preserve_area: bool = False,
+        rope_theta: float = 10000.0,
+        image_model=None,
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+        **block_kwargs,
+    ):
+        super().__init__()
+
+        self.dtype = dtype
+        self.learn_sigma = learn_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.hidden_size_x = hidden_size_x
+        self.hidden_size_y = hidden_size_y
+        self.head_dim = (
+            hidden_size_x // num_heads
+        )  # Head dimension and count is determined by visual.
+        self.attend_to_padding = attend_to_padding
+        self.use_extended_posenc = use_extended_posenc
+        self.posenc_preserve_area = posenc_preserve_area
+        self.use_t5 = use_t5
+        self.t5_token_length = t5_token_length
+        self.t5_feat_dim = t5_feat_dim
+        self.rope_theta = (
+            rope_theta  # Scaling factor for frequency computation for temporal RoPE.
+        )
+
+        self.x_embedder = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_channels,
+            embed_dim=hidden_size_x,
+            bias=patch_embed_bias,
+            dtype=dtype,
+            device=device,
+            operations=operations
+        )
+        # Conditionings
+        # Timestep
+        self.t_embedder = TimestepEmbedder(
+            hidden_size_x, bias=timestep_mlp_bias, timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
+        )
+
+        if self.use_t5:
+            # Caption Pooling (T5)
+            self.t5_y_embedder = AttentionPool(
+                t5_feat_dim, num_heads=8, output_dim=hidden_size_x, dtype=dtype, device=device, operations=operations
+            )
+
+            # Dense Embedding Projection (T5)
+            self.t5_yproj = operations.Linear(
+                t5_feat_dim, hidden_size_y, bias=True, dtype=dtype, device=device
+            )
+
+        # Initialize pos_frequencies as an empty parameter.
+        self.pos_frequencies = nn.Parameter(
+            torch.empty(3, self.num_heads, self.head_dim // 2, dtype=dtype, device=device)
+        )
+
+        assert not self.attend_to_padding
+
+        # for depth 48:
+        #  b =  0: AsymmetricJointBlock, update_y=True
+        #  b =  1: AsymmetricJointBlock, update_y=True
+        #  ...
+        #  b = 46: AsymmetricJointBlock, update_y=True
+        #  b = 47: AsymmetricJointBlock, update_y=False. No need to update text features.
+        blocks = []
+        for b in range(depth):
+            # Joint multi-modal block
+            update_y = b < depth - 1
+            block = AsymmetricJointBlock(
+                hidden_size_x,
+                hidden_size_y,
+                num_heads,
+                mlp_ratio_x=mlp_ratio_x,
+                mlp_ratio_y=mlp_ratio_y,
+                update_y=update_y,
+                attend_to_padding=attend_to_padding,
+                device=device,
+                dtype=dtype,
+                operations=operations,
+                **block_kwargs,
+            )
+
+            blocks.append(block)
+        self.blocks = nn.ModuleList(blocks)
+
+        self.final_layer = FinalLayer(
+            hidden_size_x, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
+        )
+
+    def embed_x(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: (B, C=12, T, H, W) tensor of visual tokens
+
+        Returns:
+            x: (B, C=3072, N) tensor of visual tokens with positional embedding.
+        """
+        return self.x_embedder(x)  # Convert BcTHW to BCN
+
+    def prepare(
+        self,
+        x: torch.Tensor,
+        sigma: torch.Tensor,
+        t5_feat: torch.Tensor,
+        t5_mask: torch.Tensor,
+    ):
+        """Prepare input and conditioning embeddings."""
+        # Visual patch embeddings with positional encoding.
+        T, H, W = x.shape[-3:]
+        pH, pW = H // self.patch_size, W // self.patch_size
+        x = self.embed_x(x)  # (B, N, D), where N = T * H * W / patch_size ** 2
+        assert x.ndim == 3
+        B = x.size(0)
+
+
+        pH, pW = H // self.patch_size, W // self.patch_size
+        N = T * pH * pW
+        assert x.size(1) == N
+        pos = create_position_matrix(
+            T, pH=pH, pW=pW, device=x.device, dtype=torch.float32
+        )  # (N, 3)
+        rope_cos, rope_sin = compute_mixed_rotation(
+            freqs=comfy.ops.cast_to(self.pos_frequencies, dtype=x.dtype, device=x.device), pos=pos
+        )  # Each are (N, num_heads, dim // 2)
+
+        c_t = self.t_embedder(1 - sigma, out_dtype=x.dtype)  # (B, D)
+
+        t5_y_pool = self.t5_y_embedder(t5_feat, t5_mask)  # (B, D)
+
+        c = c_t + t5_y_pool
+
+        y_feat = self.t5_yproj(t5_feat)  # (B, L, t5_feat_dim) --> (B, L, D)
+
+        return x, c, y_feat, rope_cos, rope_sin
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timestep: torch.Tensor,
+        context: List[torch.Tensor],
+        attention_mask: List[torch.Tensor],
+        num_tokens=256,
+        packed_indices: Dict[str, torch.Tensor] = None,
+        rope_cos: torch.Tensor = None,
+        rope_sin: torch.Tensor = None,
+        control=None, transformer_options={}, **kwargs
+    ):
+        patches_replace = transformer_options.get("patches_replace", {})
+        y_feat = context
+        y_mask = attention_mask
+        sigma = timestep
+        """Forward pass of DiT.
+
+        Args:
+            x: (B, C, T, H, W) tensor of spatial inputs (images or latent representations of images)
+            sigma: (B,) tensor of noise standard deviations
+            y_feat: List((B, L, y_feat_dim) tensor of caption token features. For SDXL text encoders: L=77, y_feat_dim=2048)
+            y_mask: List((B, L) boolean tensor indicating which tokens are not padding)
+            packed_indices: Dict with keys for Flash Attention. Result of compute_packed_indices.
+        """
+        B, _, T, H, W = x.shape
+
+        x, c, y_feat, rope_cos, rope_sin = self.prepare(
+            x, sigma, y_feat, y_mask
+        )
+        del y_mask
+
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"], out["txt"] = block(
+                                                    args["img"],
+                                                    args["vec"],
+                                                    args["txt"],
+                                                    rope_cos=args["rope_cos"],
+                                                    rope_sin=args["rope_sin"],
+                                                    crop_y=args["num_tokens"]
+                                                    )
+                    return out
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens}, {"original_block": block_wrap})
+                y_feat = out["txt"]
+                x = out["img"]
+            else:
+                x, y_feat = block(
+                    x,
+                    c,
+                    y_feat,
+                    rope_cos=rope_cos,
+                    rope_sin=rope_sin,
+                    crop_y=num_tokens,
+                )  # (B, M, D), (B, L, D)
+        del y_feat  # Final layers don't use dense text features.
+
+        x = self.final_layer(x, c)  # (B, M, patch_size ** 2 * out_channels)
+        x = rearrange(
+            x,
+            "B (T hp wp) (p1 p2 c) -> B c T (hp p1) (wp p2)",
+            T=T,
+            hp=H // self.patch_size,
+            wp=W // self.patch_size,
+            p1=self.patch_size,
+            p2=self.patch_size,
+            c=self.out_channels,
+        )
+
+        return -x
--- a/comfy/ldm/genmo/joint_model/layers.py
+++ b/comfy/ldm/genmo/joint_model/layers.py
@@ -0,0 +1,164 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+#adapted to ComfyUI
+
+import collections.abc
+import math
+from itertools import repeat
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+import comfy.ldm.common_dit
+
+
+# From PyTorch internals
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+to_2tuple = _ntuple(2)
+
+
+class TimestepEmbedder(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        frequency_embedding_size: int = 256,
+        *,
+        bias: bool = True,
+        timestep_scale: Optional[float] = None,
+        dtype=None,
+        device=None,
+        operations=None,
+    ):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            operations.Linear(frequency_embedding_size, hidden_size, bias=bias, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(hidden_size, hidden_size, bias=bias, dtype=dtype, device=device),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+        self.timestep_scale = timestep_scale
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        half = dim // 2
+        freqs = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
+        freqs.mul_(-math.log(max_period) / half).exp_()
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
+        return embedding
+
+    def forward(self, t, out_dtype):
+        if self.timestep_scale is not None:
+            t = t * self.timestep_scale
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype=out_dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+
+
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_size: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+    ):
+        super().__init__()
+        # keep parameter count and computation constant compared to standard FFN
+        hidden_size = int(2 * hidden_size / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_size = int(ffn_dim_multiplier * hidden_size)
+        hidden_size = multiple_of * ((hidden_size + multiple_of - 1) // multiple_of)
+
+        self.hidden_dim = hidden_size
+        self.w1 = operations.Linear(in_features, 2 * hidden_size, bias=False, device=device, dtype=dtype)
+        self.w2 = operations.Linear(hidden_size, in_features, bias=False, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x, gate = self.w1(x).chunk(2, dim=-1)
+        x = self.w2(F.silu(x) * gate)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten: bool = True,
+        bias: bool = True,
+        dynamic_img_pad: bool = False,
+        dtype=None,
+        device=None,
+        operations=None,
+    ):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.flatten = flatten
+        self.dynamic_img_pad = dynamic_img_pad
+
+        self.proj = operations.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=bias,
+            device=device,
+            dtype=dtype,
+        )
+        assert norm_layer is None
+        self.norm = (
+            norm_layer(embed_dim, device=device) if norm_layer else nn.Identity()
+        )
+
+    def forward(self, x):
+        B, _C, T, H, W = x.shape
+        if not self.dynamic_img_pad:
+            assert H % self.patch_size[0] == 0, f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})."
+            assert W % self.patch_size[1] == 0, f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
+        else:
+            pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
+            pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
+            x = F.pad(x, (0, pad_w, 0, pad_h))
+
+        x = rearrange(x, "B C T H W -> (B T) C H W", B=B, T=T)
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode='circular')
+        x = self.proj(x)
+
+        # Flatten temporal and spatial dimensions.
+        if not self.flatten:
+            raise NotImplementedError("Must flatten output.")
+        x = rearrange(x, "(B T) C H W -> B (T H W) C", B=B, T=T)
+
+        x = self.norm(x)
+        return x
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        self.register_parameter("bias", None)
+
+    def forward(self, x):
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
--- a/comfy/ldm/genmo/joint_model/rope_mixed.py
+++ b/comfy/ldm/genmo/joint_model/rope_mixed.py
@@ -0,0 +1,88 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+
+# import functools
+import math
+
+import torch
+
+
+def centers(start: float, stop, num, dtype=None, device=None):
+    """linspace through bin centers.
+
+    Args:
+        start (float): Start of the range.
+        stop (float): End of the range.
+        num (int): Number of points.
+        dtype (torch.dtype): Data type of the points.
+        device (torch.device): Device of the points.
+
+    Returns:
+        centers (Tensor): Centers of the bins. Shape: (num,).
+    """
+    edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
+    return (edges[:-1] + edges[1:]) / 2
+
+
+# @functools.lru_cache(maxsize=1)
+def create_position_matrix(
+    T: int,
+    pH: int,
+    pW: int,
+    device: torch.device,
+    dtype: torch.dtype,
+    *,
+    target_area: float = 36864,
+):
+    """
+    Args:
+        T: int - Temporal dimension
+        pH: int - Height dimension after patchify
+        pW: int - Width dimension after patchify
+
+    Returns:
+        pos: [T * pH * pW, 3] - position matrix
+    """
+    # Create 1D tensors for each dimension
+    t = torch.arange(T, dtype=dtype)
+
+    # Positionally interpolate to area 36864.
+    # (3072x3072 frame with 16x16 patches = 192x192 latents).
+    # This automatically scales rope positions when the resolution changes.
+    # We use a large target area so the model is more sensitive
+    # to changes in the learned pos_frequencies matrix.
+    scale = math.sqrt(target_area / (pW * pH))
+    w = centers(-pW * scale / 2, pW * scale / 2, pW)
+    h = centers(-pH * scale / 2, pH * scale / 2, pH)
+
+    # Use meshgrid to create 3D grids
+    grid_t, grid_h, grid_w = torch.meshgrid(t, h, w, indexing="ij")
+
+    # Stack and reshape the grids.
+    pos = torch.stack([grid_t, grid_h, grid_w], dim=-1)  # [T, pH, pW, 3]
+    pos = pos.view(-1, 3)  # [T * pH * pW, 3]
+    pos = pos.to(dtype=dtype, device=device)
+
+    return pos
+
+
+def compute_mixed_rotation(
+    freqs: torch.Tensor,
+    pos: torch.Tensor,
+):
+    """
+    Project each 3-dim position into per-head, per-head-dim 1D frequencies.
+
+    Args:
+        freqs: [3, num_heads, num_freqs] - learned rotation frequency (for t, row, col) for each head position
+        pos: [N, 3] - position of each token
+        num_heads: int
+
+    Returns:
+        freqs_cos: [N, num_heads, num_freqs] - cosine components
+        freqs_sin: [N, num_heads, num_freqs] - sine components
+    """
+    assert freqs.ndim == 3
+    freqs_sum = torch.einsum("Nd,dhf->Nhf", pos.to(freqs), freqs)
+    freqs_cos = torch.cos(freqs_sum)
+    freqs_sin = torch.sin(freqs_sum)
+    return freqs_cos, freqs_sin
--- a/comfy/ldm/genmo/joint_model/temporal_rope.py
+++ b/comfy/ldm/genmo/joint_model/temporal_rope.py
@@ -0,0 +1,34 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+
+# Based on Llama3 Implementation.
+import torch
+
+
+def apply_rotary_emb_qk_real(
+    xqk: torch.Tensor,
+    freqs_cos: torch.Tensor,
+    freqs_sin: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor without complex numbers.
+
+    Args:
+        xqk (torch.Tensor): Query and/or Key tensors to apply rotary embeddings. Shape: (B, S, *, num_heads, D)
+                            Can be either just query or just key, or both stacked along some batch or * dim.
+        freqs_cos (torch.Tensor): Precomputed cosine frequency tensor.
+        freqs_sin (torch.Tensor): Precomputed sine frequency tensor.
+
+    Returns:
+        torch.Tensor: The input tensor with rotary embeddings applied.
+    """
+    # Split the last dimension into even and odd parts
+    xqk_even = xqk[..., 0::2]
+    xqk_odd = xqk[..., 1::2]
+
+    # Apply rotation
+    cos_part = (xqk_even * freqs_cos - xqk_odd * freqs_sin).type_as(xqk)
+    sin_part = (xqk_even * freqs_sin + xqk_odd * freqs_cos).type_as(xqk)
+
+    # Interleave the results back into the original shape
+    out = torch.stack([cos_part, sin_part], dim=-1).flatten(-2)
+    return out
--- a/comfy/ldm/genmo/joint_model/utils.py
+++ b/comfy/ldm/genmo/joint_model/utils.py
@@ -0,0 +1,102 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+#adapted to ComfyUI
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+def pool_tokens(x: torch.Tensor, mask: torch.Tensor, *, keepdim=False) -> torch.Tensor:
+    """
+    Pool tokens in x using mask.
+
+    NOTE: We assume x does not require gradients.
+
+    Args:
+        x: (B, L, D) tensor of tokens.
+        mask: (B, L) boolean tensor indicating which tokens are not padding.
+
+    Returns:
+        pooled: (B, D) tensor of pooled tokens.
+    """
+    assert x.size(1) == mask.size(1)  # Expected mask to have same length as tokens.
+    assert x.size(0) == mask.size(0)  # Expected mask to have same batch size as tokens.
+    mask = mask[:, :, None].to(dtype=x.dtype)
+    mask = mask / mask.sum(dim=1, keepdim=True).clamp(min=1)
+    pooled = (x * mask).sum(dim=1, keepdim=keepdim)
+    return pooled
+
+
+class AttentionPool(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        output_dim: int = None,
+        device: Optional[torch.device] = None,
+        dtype=None,
+        operations=None,
+    ):
+        """
+        Args:
+            spatial_dim (int): Number of tokens in sequence length.
+            embed_dim (int): Dimensionality of input tokens.
+            num_heads (int): Number of attention heads.
+            output_dim (int): Dimensionality of output tokens. Defaults to embed_dim.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        self.to_kv = operations.Linear(embed_dim, 2 * embed_dim, device=device, dtype=dtype)
+        self.to_q = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype)
+        self.to_out = operations.Linear(embed_dim, output_dim or embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x, mask):
+        """
+        Args:
+            x (torch.Tensor): (B, L, D) tensor of input tokens.
+            mask (torch.Tensor): (B, L) boolean tensor indicating which tokens are not padding.
+
+        NOTE: We assume x does not require gradients.
+
+        Returns:
+            x (torch.Tensor): (B, D) tensor of pooled tokens.
+        """
+        D = x.size(2)
+
+        # Construct attention mask, shape: (B, 1, num_queries=1, num_keys=1+L).
+        attn_mask = mask[:, None, None, :].bool()  # (B, 1, 1, L).
+        attn_mask = F.pad(attn_mask, (1, 0), value=True)  # (B, 1, 1, 1+L).
+
+        # Average non-padding token features. These will be used as the query.
+        x_pool = pool_tokens(x, mask, keepdim=True)  # (B, 1, D)
+
+        # Concat pooled features to input sequence.
+        x = torch.cat([x_pool, x], dim=1)  # (B, L+1, D)
+
+        # Compute queries, keys, values. Only the mean token is used to create a query.
+        kv = self.to_kv(x)  # (B, L+1, 2 * D)
+        q = self.to_q(x[:, 0])  # (B, D)
+
+        # Extract heads.
+        head_dim = D // self.num_heads
+        kv = kv.unflatten(2, (2, self.num_heads, head_dim))  # (B, 1+L, 2, H, head_dim)
+        kv = kv.transpose(1, 3)  # (B, H, 2, 1+L, head_dim)
+        k, v = kv.unbind(2)  # (B, H, 1+L, head_dim)
+        q = q.unflatten(1, (self.num_heads, head_dim))  # (B, H, head_dim)
+        q = q.unsqueeze(2)  # (B, H, 1, head_dim)
+
+        # Compute attention.
+        x = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attn_mask, dropout_p=0.0
+        )  # (B, H, 1, head_dim)
+
+        # Concatenate heads and run output.
+        x = x.squeeze(2).flatten(1, 2)  # (B, D = H * head_dim)
+        x = self.to_out(x)
+        return x
--- a/comfy/ldm/genmo/vae/model.py
+++ b/comfy/ldm/genmo/vae/model.py
@@ -0,0 +1,711 @@
+#original code from https://github.com/genmoai/models under apache 2.0 license
+#adapted to ComfyUI
+
+from typing import Callable, List, Optional, Tuple, Union
+from functools import partial
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from comfy.ldm.modules.attention import optimized_attention
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+# import mochi_preview.dit.joint_model.context_parallel as cp
+# from mochi_preview.vae.cp_conv import cp_pass_frames, gather_all_frames
+
+
+def cast_tuple(t, length=1):
+    return t if isinstance(t, tuple) else ((t,) * length)
+
+
+class GroupNormSpatial(ops.GroupNorm):
+    """
+    GroupNorm applied per-frame.
+    """
+
+    def forward(self, x: torch.Tensor, *, chunk_size: int = 8):
+        B, C, T, H, W = x.shape
+        x = rearrange(x, "B C T H W -> (B T) C H W")
+        # Run group norm in chunks.
+        output = torch.empty_like(x)
+        for b in range(0, B * T, chunk_size):
+            output[b : b + chunk_size] = super().forward(x[b : b + chunk_size])
+        return rearrange(output, "(B T) C H W -> B C T H W", B=B, T=T)
+
+class PConv3d(ops.Conv3d):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]],
+        causal: bool = True,
+        context_parallel: bool = True,
+        **kwargs,
+    ):
+        self.causal = causal
+        self.context_parallel = context_parallel
+        kernel_size = cast_tuple(kernel_size, 3)
+        stride = cast_tuple(stride, 3)
+        height_pad = (kernel_size[1] - 1) // 2
+        width_pad = (kernel_size[2] - 1) // 2
+
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            dilation=(1, 1, 1),
+            padding=(0, height_pad, width_pad),
+            **kwargs,
+        )
+
+    def forward(self, x: torch.Tensor):
+        # Compute padding amounts.
+        context_size = self.kernel_size[0] - 1
+        if self.causal:
+            pad_front = context_size
+            pad_back = 0
+        else:
+            pad_front = context_size // 2
+            pad_back = context_size - pad_front
+
+        # Apply padding.
+        assert self.padding_mode == "replicate"  # DEBUG
+        mode = "constant" if self.padding_mode == "zeros" else self.padding_mode
+        x = F.pad(x, (0, 0, 0, 0, pad_front, pad_back), mode=mode)
+        return super().forward(x)
+
+
+class Conv1x1(ops.Linear):
+    """*1x1 Conv implemented with a linear layer."""
+
+    def __init__(self, in_features: int, out_features: int, *args, **kwargs):
+        super().__init__(in_features, out_features, *args, **kwargs)
+
+    def forward(self, x: torch.Tensor):
+        """Forward pass.
+
+        Args:
+            x: Input tensor. Shape: [B, C, *] or [B, *, C].
+
+        Returns:
+            x: Output tensor. Shape: [B, C', *] or [B, *, C'].
+        """
+        x = x.movedim(1, -1)
+        x = super().forward(x)
+        x = x.movedim(-1, 1)
+        return x
+
+
+class DepthToSpaceTime(nn.Module):
+    def __init__(
+        self,
+        temporal_expansion: int,
+        spatial_expansion: int,
+    ):
+        super().__init__()
+        self.temporal_expansion = temporal_expansion
+        self.spatial_expansion = spatial_expansion
+
+    # When printed, this module should show the temporal and spatial expansion factors.
+    def extra_repr(self):
+        return f"texp={self.temporal_expansion}, sexp={self.spatial_expansion}"
+
+    def forward(self, x: torch.Tensor):
+        """Forward pass.
+
+        Args:
+            x: Input tensor. Shape: [B, C, T, H, W].
+
+        Returns:
+            x: Rearranged tensor. Shape: [B, C/(st*s*s), T*st, H*s, W*s].
+        """
+        x = rearrange(
+            x,
+            "B (C st sh sw) T H W -> B C (T st) (H sh) (W sw)",
+            st=self.temporal_expansion,
+            sh=self.spatial_expansion,
+            sw=self.spatial_expansion,
+        )
+
+        # cp_rank, _ = cp.get_cp_rank_size()
+        if self.temporal_expansion > 1: # and cp_rank == 0:
+            # Drop the first self.temporal_expansion - 1 frames.
+            # This is because we always want the 3x3x3 conv filter to only apply
+            # to the first frame, and the first frame doesn't need to be repeated.
+            assert all(x.shape)
+            x = x[:, :, self.temporal_expansion - 1 :]
+            assert all(x.shape)
+
+        return x
+
+
+def norm_fn(
+    in_channels: int,
+    affine: bool = True,
+):
+    return GroupNormSpatial(affine=affine, num_groups=32, num_channels=in_channels)
+
+
+class ResBlock(nn.Module):
+    """Residual block that preserves the spatial dimensions."""
+
+    def __init__(
+        self,
+        channels: int,
+        *,
+        affine: bool = True,
+        attn_block: Optional[nn.Module] = None,
+        causal: bool = True,
+        prune_bottleneck: bool = False,
+        padding_mode: str,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.channels = channels
+
+        assert causal
+        self.stack = nn.Sequential(
+            norm_fn(channels, affine=affine),
+            nn.SiLU(inplace=True),
+            PConv3d(
+                in_channels=channels,
+                out_channels=channels // 2 if prune_bottleneck else channels,
+                kernel_size=(3, 3, 3),
+                stride=(1, 1, 1),
+                padding_mode=padding_mode,
+                bias=bias,
+                causal=causal,
+            ),
+            norm_fn(channels, affine=affine),
+            nn.SiLU(inplace=True),
+            PConv3d(
+                in_channels=channels // 2 if prune_bottleneck else channels,
+                out_channels=channels,
+                kernel_size=(3, 3, 3),
+                stride=(1, 1, 1),
+                padding_mode=padding_mode,
+                bias=bias,
+                causal=causal,
+            ),
+        )
+
+        self.attn_block = attn_block if attn_block else nn.Identity()
+
+    def forward(self, x: torch.Tensor):
+        """Forward pass.
+
+        Args:
+            x: Input tensor. Shape: [B, C, T, H, W].
+        """
+        residual = x
+        x = self.stack(x)
+        x = x + residual
+        del residual
+
+        return self.attn_block(x)
+
+
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        head_dim: int = 32,
+        qkv_bias: bool = False,
+        out_bias: bool = True,
+        qk_norm: bool = True,
+    ) -> None:
+        super().__init__()
+        self.head_dim = head_dim
+        self.num_heads = dim // head_dim
+        self.qk_norm = qk_norm
+
+        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
+        self.out = nn.Linear(dim, dim, bias=out_bias)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute temporal self-attention.
+
+        Args:
+            x: Input tensor. Shape: [B, C, T, H, W].
+            chunk_size: Chunk size for large tensors.
+
+        Returns:
+            x: Output tensor. Shape: [B, C, T, H, W].
+        """
+        B, _, T, H, W = x.shape
+
+        if T == 1:
+            # No attention for single frame.
+            x = x.movedim(1, -1)  # [B, C, T, H, W] -> [B, T, H, W, C]
+            qkv = self.qkv(x)
+            _, _, x = qkv.chunk(3, dim=-1)  # Throw away queries and keys.
+            x = self.out(x)
+            return x.movedim(-1, 1)  # [B, T, H, W, C] -> [B, C, T, H, W]
+
+        # 1D temporal attention.
+        x = rearrange(x, "B C t h w -> (B h w) t C")
+        qkv = self.qkv(x)
+
+        # Input: qkv with shape [B, t, 3 * num_heads * head_dim]
+        # Output: x with shape [B, num_heads, t, head_dim]
+        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(2)
+
+        if self.qk_norm:
+            q = F.normalize(q, p=2, dim=-1)
+            k = F.normalize(k, p=2, dim=-1)
+
+        x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True)
+
+        assert x.size(0) == q.size(0)
+
+        x = self.out(x)
+        x = rearrange(x, "(B h w) t C -> B C t h w", B=B, h=H, w=W)
+        return x
+
+
+class AttentionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        **attn_kwargs,
+    ) -> None:
+        super().__init__()
+        self.norm = norm_fn(dim)
+        self.attn = Attention(dim, **attn_kwargs)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.attn(self.norm(x))
+
+
+class CausalUpsampleBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_res_blocks: int,
+        *,
+        temporal_expansion: int = 2,
+        spatial_expansion: int = 2,
+        **block_kwargs,
+    ):
+        super().__init__()
+
+        blocks = []
+        for _ in range(num_res_blocks):
+            blocks.append(block_fn(in_channels, **block_kwargs))
+        self.blocks = nn.Sequential(*blocks)
+
+        self.temporal_expansion = temporal_expansion
+        self.spatial_expansion = spatial_expansion
+
+        # Change channels in the final convolution layer.
+        self.proj = Conv1x1(
+            in_channels,
+            out_channels * temporal_expansion * (spatial_expansion**2),
+        )
+
+        self.d2st = DepthToSpaceTime(
+            temporal_expansion=temporal_expansion, spatial_expansion=spatial_expansion
+        )
+
+    def forward(self, x):
+        x = self.blocks(x)
+        x = self.proj(x)
+        x = self.d2st(x)
+        return x
+
+
+def block_fn(channels, *, affine: bool = True, has_attention: bool = False, **block_kwargs):
+    attn_block = AttentionBlock(channels) if has_attention else None
+    return ResBlock(channels, affine=affine, attn_block=attn_block, **block_kwargs)
+
+
+class DownsampleBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_res_blocks,
+        *,
+        temporal_reduction=2,
+        spatial_reduction=2,
+        **block_kwargs,
+    ):
+        """
+        Downsample block for the VAE encoder.
+
+        Args:
+            in_channels: Number of input channels.
+            out_channels: Number of output channels.
+            num_res_blocks: Number of residual blocks.
+            temporal_reduction: Temporal reduction factor.
+            spatial_reduction: Spatial reduction factor.
+        """
+        super().__init__()
+        layers = []
+
+        # Change the channel count in the strided convolution.
+        # This lets the ResBlock have uniform channel count,
+        # as in ConvNeXt.
+        assert in_channels != out_channels
+        layers.append(
+            PConv3d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=(temporal_reduction, spatial_reduction, spatial_reduction),
+                stride=(temporal_reduction, spatial_reduction, spatial_reduction),
+                # First layer in each block always uses replicate padding
+                padding_mode="replicate",
+                bias=block_kwargs["bias"],
+            )
+        )
+
+        for _ in range(num_res_blocks):
+            layers.append(block_fn(out_channels, **block_kwargs))
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+def add_fourier_features(inputs: torch.Tensor, start=6, stop=8, step=1):
+    num_freqs = (stop - start) // step
+    assert inputs.ndim == 5
+    C = inputs.size(1)
+
+    # Create Base 2 Fourier features.
+    freqs = torch.arange(start, stop, step, dtype=inputs.dtype, device=inputs.device)
+    assert num_freqs == len(freqs)
+    w = torch.pow(2.0, freqs) * (2 * torch.pi)  # [num_freqs]
+    C = inputs.shape[1]
+    w = w.repeat(C)[None, :, None, None, None]  # [1, C * num_freqs, 1, 1, 1]
+
+    # Interleaved repeat of input channels to match w.
+    h = inputs.repeat_interleave(num_freqs, dim=1)  # [B, C * num_freqs, T, H, W]
+    # Scale channels by frequency.
+    h = w * h
+
+    return torch.cat(
+        [
+            inputs,
+            torch.sin(h),
+            torch.cos(h),
+        ],
+        dim=1,
+    )
+
+
+class FourierFeatures(nn.Module):
+    def __init__(self, start: int = 6, stop: int = 8, step: int = 1):
+        super().__init__()
+        self.start = start
+        self.stop = stop
+        self.step = step
+
+    def forward(self, inputs):
+        """Add Fourier features to inputs.
+
+        Args:
+            inputs: Input tensor. Shape: [B, C, T, H, W]
+
+        Returns:
+            h: Output tensor. Shape: [B, (1 + 2 * num_freqs) * C, T, H, W]
+        """
+        return add_fourier_features(inputs, self.start, self.stop, self.step)
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        *,
+        out_channels: int = 3,
+        latent_dim: int,
+        base_channels: int,
+        channel_multipliers: List[int],
+        num_res_blocks: List[int],
+        temporal_expansions: Optional[List[int]] = None,
+        spatial_expansions: Optional[List[int]] = None,
+        has_attention: List[bool],
+        output_norm: bool = True,
+        nonlinearity: str = "silu",
+        output_nonlinearity: str = "silu",
+        causal: bool = True,
+        **block_kwargs,
+    ):
+        super().__init__()
+        self.input_channels = latent_dim
+        self.base_channels = base_channels
+        self.channel_multipliers = channel_multipliers
+        self.num_res_blocks = num_res_blocks
+        self.output_nonlinearity = output_nonlinearity
+        assert nonlinearity == "silu"
+        assert causal
+
+        ch = [mult * base_channels for mult in channel_multipliers]
+        self.num_up_blocks = len(ch) - 1
+        assert len(num_res_blocks) == self.num_up_blocks + 2
+
+        blocks = []
+
+        first_block = [
+            ops.Conv3d(latent_dim, ch[-1], kernel_size=(1, 1, 1))
+        ]  # Input layer.
+        # First set of blocks preserve channel count.
+        for _ in range(num_res_blocks[-1]):
+            first_block.append(
+                block_fn(
+                    ch[-1],
+                    has_attention=has_attention[-1],
+                    causal=causal,
+                    **block_kwargs,
+                )
+            )
+        blocks.append(nn.Sequential(*first_block))
+
+        assert len(temporal_expansions) == len(spatial_expansions) == self.num_up_blocks
+        assert len(num_res_blocks) == len(has_attention) == self.num_up_blocks + 2
+
+        upsample_block_fn = CausalUpsampleBlock
+
+        for i in range(self.num_up_blocks):
+            block = upsample_block_fn(
+                ch[-i - 1],
+                ch[-i - 2],
+                num_res_blocks=num_res_blocks[-i - 2],
+                has_attention=has_attention[-i - 2],
+                temporal_expansion=temporal_expansions[-i - 1],
+                spatial_expansion=spatial_expansions[-i - 1],
+                causal=causal,
+                **block_kwargs,
+            )
+            blocks.append(block)
+
+        assert not output_norm
+
+        # Last block. Preserve channel count.
+        last_block = []
+        for _ in range(num_res_blocks[0]):
+            last_block.append(
+                block_fn(
+                    ch[0], has_attention=has_attention[0], causal=causal, **block_kwargs
+                )
+            )
+        blocks.append(nn.Sequential(*last_block))
+
+        self.blocks = nn.ModuleList(blocks)
+        self.output_proj = Conv1x1(ch[0], out_channels)
+
+    def forward(self, x):
+        """Forward pass.
+
+        Args:
+            x: Latent tensor. Shape: [B, input_channels, t, h, w]. Scaled [-1, 1].
+
+        Returns:
+            x: Reconstructed video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1].
+               T + 1 = (t - 1) * 4.
+               H = h * 16, W = w * 16.
+        """
+        for block in self.blocks:
+            x = block(x)
+
+        if self.output_nonlinearity == "silu":
+            x = F.silu(x, inplace=not self.training)
+        else:
+            assert (
+                not self.output_nonlinearity
+            )  # StyleGAN3 omits the to-RGB nonlinearity.
+
+        return self.output_proj(x).contiguous()
+
+class LatentDistribution:
+    def __init__(self, mean: torch.Tensor, logvar: torch.Tensor):
+        """Initialize latent distribution.
+
+        Args:
+            mean: Mean of the distribution. Shape: [B, C, T, H, W].
+            logvar: Logarithm of variance of the distribution. Shape: [B, C, T, H, W].
+        """
+        assert mean.shape == logvar.shape
+        self.mean = mean
+        self.logvar = logvar
+
+    def sample(self, temperature=1.0, generator: torch.Generator = None, noise=None):
+        if temperature == 0.0:
+            return self.mean
+
+        if noise is None:
+            noise = torch.randn(self.mean.shape, device=self.mean.device, dtype=self.mean.dtype, generator=generator)
+        else:
+            assert noise.device == self.mean.device
+            noise = noise.to(self.mean.dtype)
+
+        if temperature != 1.0:
+            raise NotImplementedError(f"Temperature {temperature} is not supported.")
+
+        # Just Gaussian sample with no scaling of variance.
+        return noise * torch.exp(self.logvar * 0.5) + self.mean
+
+    def mode(self):
+        return self.mean
+
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        base_channels: int,
+        channel_multipliers: List[int],
+        num_res_blocks: List[int],
+        latent_dim: int,
+        temporal_reductions: List[int],
+        spatial_reductions: List[int],
+        prune_bottlenecks: List[bool],
+        has_attentions: List[bool],
+        affine: bool = True,
+        bias: bool = True,
+        input_is_conv_1x1: bool = False,
+        padding_mode: str,
+    ):
+        super().__init__()
+        self.temporal_reductions = temporal_reductions
+        self.spatial_reductions = spatial_reductions
+        self.base_channels = base_channels
+        self.channel_multipliers = channel_multipliers
+        self.num_res_blocks = num_res_blocks
+        self.latent_dim = latent_dim
+
+        self.fourier_features = FourierFeatures()
+        ch = [mult * base_channels for mult in channel_multipliers]
+        num_down_blocks = len(ch) - 1
+        assert len(num_res_blocks) == num_down_blocks + 2
+
+        layers = (
+            [ops.Conv3d(in_channels, ch[0], kernel_size=(1, 1, 1), bias=True)]
+            if not input_is_conv_1x1
+            else [Conv1x1(in_channels, ch[0])]
+        )
+
+        assert len(prune_bottlenecks) == num_down_blocks + 2
+        assert len(has_attentions) == num_down_blocks + 2
+        block = partial(block_fn, padding_mode=padding_mode, affine=affine, bias=bias)
+
+        for _ in range(num_res_blocks[0]):
+            layers.append(block(ch[0], has_attention=has_attentions[0], prune_bottleneck=prune_bottlenecks[0]))
+        prune_bottlenecks = prune_bottlenecks[1:]
+        has_attentions = has_attentions[1:]
+
+        assert len(temporal_reductions) == len(spatial_reductions) == len(ch) - 1
+        for i in range(num_down_blocks):
+            layer = DownsampleBlock(
+                ch[i],
+                ch[i + 1],
+                num_res_blocks=num_res_blocks[i + 1],
+                temporal_reduction=temporal_reductions[i],
+                spatial_reduction=spatial_reductions[i],
+                prune_bottleneck=prune_bottlenecks[i],
+                has_attention=has_attentions[i],
+                affine=affine,
+                bias=bias,
+                padding_mode=padding_mode,
+            )
+
+            layers.append(layer)
+
+        # Additional blocks.
+        for _ in range(num_res_blocks[-1]):
+            layers.append(block(ch[-1], has_attention=has_attentions[-1], prune_bottleneck=prune_bottlenecks[-1]))
+
+        self.layers = nn.Sequential(*layers)
+
+        # Output layers.
+        self.output_norm = norm_fn(ch[-1])
+        self.output_proj = Conv1x1(ch[-1], 2 * latent_dim, bias=False)
+
+    @property
+    def temporal_downsample(self):
+        return math.prod(self.temporal_reductions)
+
+    @property
+    def spatial_downsample(self):
+        return math.prod(self.spatial_reductions)
+
+    def forward(self, x) -> LatentDistribution:
+        """Forward pass.
+
+        Args:
+            x: Input video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1]
+
+        Returns:
+            means: Latent tensor. Shape: [B, latent_dim, t, h, w]. Scaled [-1, 1].
+                   h = H // 8, w = W // 8, t - 1 = (T - 1) // 6
+            logvar: Shape: [B, latent_dim, t, h, w].
+        """
+        assert x.ndim == 5, f"Expected 5D input, got {x.shape}"
+        x = self.fourier_features(x)
+
+        x = self.layers(x)
+
+        x = self.output_norm(x)
+        x = F.silu(x, inplace=True)
+        x = self.output_proj(x)
+
+        means, logvar = torch.chunk(x, 2, dim=1)
+
+        assert means.ndim == 5
+        assert logvar.shape == means.shape
+        assert means.size(1) == self.latent_dim
+
+        return LatentDistribution(means, logvar)
+
+
+class VideoVAE(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.encoder = Encoder(
+            in_channels=15,
+            base_channels=64,
+            channel_multipliers=[1, 2, 4, 6],
+            num_res_blocks=[3, 3, 4, 6, 3],
+            latent_dim=12,
+            temporal_reductions=[1, 2, 3],
+            spatial_reductions=[2, 2, 2],
+            prune_bottlenecks=[False, False, False, False, False],
+            has_attentions=[False, True, True, True, True],
+            affine=True,
+            bias=True,
+            input_is_conv_1x1=True,
+            padding_mode="replicate"
+        )
+        self.decoder = Decoder(
+            out_channels=3,
+            base_channels=128,
+            channel_multipliers=[1, 2, 4, 6],
+            temporal_expansions=[1, 2, 3],
+            spatial_expansions=[2, 2, 2],
+            num_res_blocks=[3, 3, 4, 6, 3],
+            latent_dim=12,
+            has_attention=[False, False, False, False, False],
+            padding_mode="replicate",
+            output_norm=False,
+            nonlinearity="silu",
+            output_nonlinearity="silu",
+            causal=True,
+        )
+
+    def encode(self, x):
+        return self.encoder(x).mode()
+
+    def decode(self, x):
+        return self.decoder(x)
--- a/comfy/ldm/hydit/attn_layers.py
+++ b/comfy/ldm/hydit/attn_layers.py
@@ -0,0 +1,218 @@
+import torch
+import torch.nn as nn
+from typing import Tuple, Union, Optional
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
+    """
+    Reshape frequency tensor for broadcasting it with another tensor.
+
+    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
+    for the purpose of broadcasting the frequency tensor during element-wise operations.
+
+    Args:
+        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
+        x (torch.Tensor): Target tensor for broadcasting compatibility.
+        head_first (bool): head dimension first (except batch dim) or not.
+
+    Returns:
+        torch.Tensor: Reshaped frequency tensor.
+
+    Raises:
+        AssertionError: If the frequency tensor doesn't match the expected shape.
+        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
+    """
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+
+    if isinstance(freqs_cis, tuple):
+        # freqs_cis: (cos, sin) in real space
+        if head_first:
+            assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
+            shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        else:
+            assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
+            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
+    else:
+        # freqs_cis: values in complex space
+        if head_first:
+            assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
+            shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        else:
+            assert freqs_cis.shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
+            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+        return freqs_cis.view(*shape)
+
+
+def rotate_half(x):
+    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+    return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+
+
+def apply_rotary_emb(
+        xq: torch.Tensor,
+        xk: Optional[torch.Tensor],
+        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+        head_first: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor.
+
+    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
+    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
+    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
+    returned as real tensors.
+
+    Args:
+        xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
+        xk (torch.Tensor): Key tensor to apply rotary embeddings.   [B, S, H, D]
+        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Precomputed frequency tensor for complex exponentials.
+        head_first (bool): head dimension first (except batch dim) or not.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+
+    """
+    xk_out = None
+    if isinstance(freqs_cis, tuple):
+        cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)    # [S, D]
+        xq_out = (xq * cos + rotate_half(xq) * sin)
+        if xk is not None:
+            xk_out = (xk * cos + rotate_half(xk) * sin)
+    else:
+        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # [B, S, H, D//2]
+        freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device)   # [S, D//2] --> [1, S, 1, D//2]
+        xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
+        if xk is not None:
+            xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))  # [B, S, H, D//2]
+            xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
+
+    return xq_out, xk_out
+
+
+
+class CrossAttention(nn.Module):
+    """
+    Use QK Normalization.
+    """
+    def __init__(self,
+                 qdim,
+                 kdim,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_norm=False,
+                 attn_drop=0.0,
+                 proj_drop=0.0,
+                 attn_precision=None,
+                 device=None,
+                 dtype=None,
+                 operations=None,
+                 ):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.attn_precision = attn_precision
+        self.qdim = qdim
+        self.kdim = kdim
+        self.num_heads = num_heads
+        assert self.qdim % num_heads == 0, "self.qdim must be divisible by num_heads"
+        self.head_dim = self.qdim // num_heads
+        assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
+        self.scale = self.head_dim ** -0.5
+
+        self.q_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
+        self.kv_proj = operations.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs)
+
+        # TODO: eps should be 1 / 65530 if using fp16
+        self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
+        self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.out_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, y, freqs_cis_img=None):
+        """
+        Parameters
+        ----------
+        x: torch.Tensor
+            (batch, seqlen1, hidden_dim) (where hidden_dim = num heads * head dim)
+        y: torch.Tensor
+            (batch, seqlen2, hidden_dim2)
+        freqs_cis_img: torch.Tensor
+            (batch, hidden_dim // 2), RoPE for image
+        """
+        b, s1, c = x.shape     # [b, s1, D]
+        _, s2, c = y.shape     # [b, s2, 1024]
+
+        q = self.q_proj(x).view(b, s1, self.num_heads, self.head_dim)   # [b, s1, h, d]
+        kv = self.kv_proj(y).view(b, s2, 2, self.num_heads, self.head_dim)    # [b, s2, 2, h, d]
+        k, v = kv.unbind(dim=2) # [b, s, h, d]
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        # Apply RoPE if needed
+        if freqs_cis_img is not None:
+            qq, _ = apply_rotary_emb(q, None, freqs_cis_img)
+            assert qq.shape == q.shape, f'qq: {qq.shape}, q: {q.shape}'
+            q = qq
+
+        q = q.transpose(-2, -3).contiguous()        # q ->  B, L1, H, C - B, H, L1, C
+        k = k.transpose(-2, -3).contiguous()      # k ->  B, L2, H, C - B, H, C, L2
+        v = v.transpose(-2, -3).contiguous() 
+
+        context = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
+
+        out = self.out_proj(context)  # context.reshape - B, L1, -1
+        out = self.proj_drop(out)
+
+        out_tuple = (out,)
+
+        return out_tuple
+
+
+class Attention(nn.Module):
+    """
+    We rename some layer names to align with flash attention
+    """
+    def __init__(self, dim, num_heads, qkv_bias=True, qk_norm=False, attn_drop=0., proj_drop=0., attn_precision=None, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.attn_precision = attn_precision
+        self.dim = dim
+        self.num_heads = num_heads
+        assert self.dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.head_dim = self.dim // num_heads
+        # This assertion is aligned with flash attention
+        assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
+        self.scale = self.head_dim ** -0.5
+
+        # qkv --> Wqkv
+        self.Wqkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
+        # TODO: eps should be 1 / 65530 if using fp16
+        self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
+        self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.out_proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, freqs_cis_img=None):
+        B, N, C = x.shape
+        qkv = self.Wqkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)   # [3, b, h, s, d]
+        q, k, v = qkv.unbind(0)     # [b, h, s, d]
+        q = self.q_norm(q)          # [b, h, s, d]
+        k = self.k_norm(k)          # [b, h, s, d]
+
+        # Apply RoPE if needed
+        if freqs_cis_img is not None:
+            qq, kk = apply_rotary_emb(q, k, freqs_cis_img, head_first=True)
+            assert qq.shape == q.shape and kk.shape == k.shape, \
+                f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}'
+            q, k = qq, kk
+
+        x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
+        x = self.out_proj(x)
+        x = self.proj_drop(x)
+
+        out_tuple = (x,)
+
+        return out_tuple
--- a/comfy/ldm/hydit/controlnet.py
+++ b/comfy/ldm/hydit/controlnet.py
@@ -0,0 +1,321 @@
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.utils import checkpoint
+
+from comfy.ldm.modules.diffusionmodules.mmdit import (
+    Mlp,
+    TimestepEmbedder,
+    PatchEmbed,
+    RMSNorm,
+)
+from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
+from .poolers import AttentionPool
+
+import comfy.latent_formats
+from .models import HunYuanDiTBlock, calc_rope
+
+from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
+
+
+class HunYuanControlNet(nn.Module):
+    """
+    HunYuanDiT: Diffusion model with a Transformer backbone.
+
+    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
+
+    Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
+
+    Parameters
+    ----------
+    args: argparse.Namespace
+        The arguments parsed by argparse.
+    input_size: tuple
+        The size of the input image.
+    patch_size: int
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    depth: int
+        The number of transformer blocks.
+    num_heads: int
+        The number of attention heads.
+    mlp_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    log_fn: callable
+        The logging function.
+    """
+
+    def __init__(
+        self,
+        input_size: tuple = 128,
+        patch_size: int = 2,
+        in_channels: int = 4,
+        hidden_size: int = 1408,
+        depth: int = 40,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.3637,
+        text_states_dim=1024,
+        text_states_dim_t5=2048,
+        text_len=77,
+        text_len_t5=256,
+        qk_norm=True,  # See http://arxiv.org/abs/2302.05442 for details.
+        size_cond=False,
+        use_style_cond=False,
+        learn_sigma=True,
+        norm="layer",
+        log_fn: callable = print,
+        attn_precision=None,
+        dtype=None,
+        device=None,
+        operations=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.log_fn = log_fn
+        self.depth = depth
+        self.learn_sigma = learn_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.text_states_dim = text_states_dim
+        self.text_states_dim_t5 = text_states_dim_t5
+        self.text_len = text_len
+        self.text_len_t5 = text_len_t5
+        self.size_cond = size_cond
+        self.use_style_cond = use_style_cond
+        self.norm = norm
+        self.dtype = dtype
+        self.latent_format = comfy.latent_formats.SDXL
+
+        self.mlp_t5 = nn.Sequential(
+            nn.Linear(
+                self.text_states_dim_t5,
+                self.text_states_dim_t5 * 4,
+                bias=True,
+                dtype=dtype,
+                device=device,
+            ),
+            nn.SiLU(),
+            nn.Linear(
+                self.text_states_dim_t5 * 4,
+                self.text_states_dim,
+                bias=True,
+                dtype=dtype,
+                device=device,
+            ),
+        )
+        # learnable replace
+        self.text_embedding_padding = nn.Parameter(
+            torch.randn(
+                self.text_len + self.text_len_t5,
+                self.text_states_dim,
+                dtype=dtype,
+                device=device,
+            )
+        )
+
+        # Attention pooling
+        pooler_out_dim = 1024
+        self.pooler = AttentionPool(
+            self.text_len_t5,
+            self.text_states_dim_t5,
+            num_heads=8,
+            output_dim=pooler_out_dim,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+
+        # Dimension of the extra input vectors
+        self.extra_in_dim = pooler_out_dim
+
+        if self.size_cond:
+            # Image size and crop size conditions
+            self.extra_in_dim += 6 * 256
+
+        if self.use_style_cond:
+            # Here we use a default learned embedder layer for future extension.
+            self.style_embedder = nn.Embedding(
+                1, hidden_size, dtype=dtype, device=device
+            )
+            self.extra_in_dim += hidden_size
+
+        # Text embedding for `add`
+        self.x_embedder = PatchEmbed(
+            input_size,
+            patch_size,
+            in_channels,
+            hidden_size,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+        self.t_embedder = TimestepEmbedder(
+            hidden_size, dtype=dtype, device=device, operations=operations
+        )
+        self.extra_embedder = nn.Sequential(
+            operations.Linear(
+                self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device
+            ),
+            nn.SiLU(),
+            operations.Linear(
+                hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device
+            ),
+        )
+
+        # Image embedding
+        num_patches = self.x_embedder.num_patches
+
+        # HUnYuanDiT Blocks
+        self.blocks = nn.ModuleList(
+            [
+                HunYuanDiTBlock(
+                    hidden_size=hidden_size,
+                    c_emb_size=hidden_size,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    text_states_dim=self.text_states_dim,
+                    qk_norm=qk_norm,
+                    norm_type=self.norm,
+                    skip=False,
+                    attn_precision=attn_precision,
+                    dtype=dtype,
+                    device=device,
+                    operations=operations,
+                )
+                for _ in range(19)
+            ]
+        )
+
+        # Input zero linear for the first block
+        self.before_proj = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
+
+
+        # Output zero linear for the every block
+        self.after_proj_list = nn.ModuleList(
+            [
+
+                    operations.Linear(
+                        self.hidden_size, self.hidden_size, dtype=dtype, device=device
+                    )
+                for _ in range(len(self.blocks))
+            ]
+        )
+
+    def forward(
+        self,
+        x,
+        hint,
+        timesteps,
+        context,#encoder_hidden_states=None,
+        text_embedding_mask=None,
+        encoder_hidden_states_t5=None,
+        text_embedding_mask_t5=None,
+        image_meta_size=None,
+        style=None,
+        return_dict=False,
+        **kwarg,
+    ):
+        """
+        Forward pass of the encoder.
+
+        Parameters
+        ----------
+        x: torch.Tensor
+            (B, D, H, W)
+        t: torch.Tensor
+            (B)
+        encoder_hidden_states: torch.Tensor
+            CLIP text embedding, (B, L_clip, D)
+        text_embedding_mask: torch.Tensor
+            CLIP text embedding mask, (B, L_clip)
+        encoder_hidden_states_t5: torch.Tensor
+            T5 text embedding, (B, L_t5, D)
+        text_embedding_mask_t5: torch.Tensor
+            T5 text embedding mask, (B, L_t5)
+        image_meta_size: torch.Tensor
+            (B, 6)
+        style: torch.Tensor
+            (B)
+        cos_cis_img: torch.Tensor
+        sin_cis_img: torch.Tensor
+        return_dict: bool
+            Whether to return a dictionary.
+        """
+        condition = hint
+        if condition.shape[0] == 1:
+            condition = torch.repeat_interleave(condition, x.shape[0], dim=0)
+
+        text_states = context  # 2,77,1024
+        text_states_t5 = encoder_hidden_states_t5  # 2,256,2048
+        text_states_mask = text_embedding_mask.bool()  # 2,77
+        text_states_t5_mask = text_embedding_mask_t5.bool()  # 2,256
+        b_t5, l_t5, c_t5 = text_states_t5.shape
+        text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
+
+        padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
+
+        text_states[:, -self.text_len :] = torch.where(
+            text_states_mask[:, -self.text_len :].unsqueeze(2),
+            text_states[:, -self.text_len :],
+            padding[: self.text_len],
+        )
+        text_states_t5[:, -self.text_len_t5 :] = torch.where(
+            text_states_t5_mask[:, -self.text_len_t5 :].unsqueeze(2),
+            text_states_t5[:, -self.text_len_t5 :],
+            padding[self.text_len :],
+        )
+
+        text_states = torch.cat([text_states, text_states_t5], dim=1)  # 2,205，1024
+
+        # _, _, oh, ow = x.shape
+        # th, tw = oh // self.patch_size, ow // self.patch_size
+
+        # Get image RoPE embedding according to `reso`lution.
+        freqs_cis_img = calc_rope(
+            x, self.patch_size, self.hidden_size // self.num_heads
+        )  # (cos_cis_img, sin_cis_img)
+
+        # ========================= Build time and image embedding =========================
+        t = self.t_embedder(timesteps, dtype=self.dtype)
+        x = self.x_embedder(x)
+
+        # ========================= Concatenate all extra vectors =========================
+        # Build text tokens with pooling
+        extra_vec = self.pooler(encoder_hidden_states_t5)
+
+        # Build image meta size tokens if applicable
+        # if image_meta_size is not None:
+        #     image_meta_size = timestep_embedding(image_meta_size.view(-1), 256)   # [B * 6, 256]
+        #     if image_meta_size.dtype != self.dtype:
+        #         image_meta_size = image_meta_size.half()
+        #     image_meta_size = image_meta_size.view(-1, 6 * 256)
+        #     extra_vec = torch.cat([extra_vec, image_meta_size], dim=1)  # [B, D + 6 * 256]
+
+        # Build style tokens
+        if style is not None:
+            style_embedding = self.style_embedder(style)
+            extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
+
+        # Concatenate all extra vectors
+        c = t + self.extra_embedder(extra_vec)  # [B, D]
+
+        # ========================= Deal with Condition =========================
+        condition = self.x_embedder(condition)
+
+        # ========================= Forward pass through HunYuanDiT blocks =========================
+        controls = []
+        x = x + self.before_proj(condition)  # add condition
+        for layer, block in enumerate(self.blocks):
+            x = block(x, c, text_states, freqs_cis_img)
+            controls.append(self.after_proj_list[layer](x))  # zero linear for output
+
+        return {"output": controls}
--- a/comfy/ldm/hydit/models.py
+++ b/comfy/ldm/hydit/models.py
@@ -0,0 +1,410 @@
+from typing import Any
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
+from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
+from torch.utils import checkpoint
+
+from .attn_layers import Attention, CrossAttention
+from .poolers import AttentionPool
+from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
+
+def calc_rope(x, patch_size, head_size):
+    th = (x.shape[2] + (patch_size // 2)) // patch_size
+    tw = (x.shape[3] + (patch_size // 2)) // patch_size
+    base_size = 512 // 8 // patch_size
+    start, stop = get_fill_resize_and_crop((th, tw), base_size)
+    sub_args = [start, stop, (th, tw)]
+    # head_size = HUNYUAN_DIT_CONFIG['DiT-g/2']['hidden_size'] // HUNYUAN_DIT_CONFIG['DiT-g/2']['num_heads']
+    rope = get_2d_rotary_pos_embed(head_size, *sub_args)
+    rope = (rope[0].to(x), rope[1].to(x))
+    return rope
+
+
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+class HunYuanDiTBlock(nn.Module):
+    """
+    A HunYuanDiT block with `add` conditioning.
+    """
+    def __init__(self,
+                 hidden_size,
+                 c_emb_size,
+                 num_heads,
+                 mlp_ratio=4.0,
+                 text_states_dim=1024,
+                 qk_norm=False,
+                 norm_type="layer",
+                 skip=False,
+                 attn_precision=None,
+                 dtype=None,
+                 device=None,
+                 operations=None,
+                 ):
+        super().__init__()
+        use_ele_affine = True
+
+        if norm_type == "layer":
+            norm_layer = operations.LayerNorm
+        elif norm_type == "rms":
+            norm_layer = RMSNorm
+        else:
+            raise ValueError(f"Unknown norm_type: {norm_type}")
+
+        # ========================= Self-Attention =========================
+        self.norm1 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
+        self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
+
+        # ========================= FFN =========================
+        self.norm2 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0, dtype=dtype, device=device, operations=operations)
+
+        # ========================= Add =========================
+        # Simply use add like SDXL.
+        self.default_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(c_emb_size, hidden_size, bias=True, dtype=dtype, device=device)
+        )
+
+        # ========================= Cross-Attention =========================
+        self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=True,
+                                        qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
+        self.norm3 = norm_layer(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
+
+        # ========================= Skip Connection =========================
+        if skip:
+            self.skip_norm = norm_layer(2 * hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
+            self.skip_linear = operations.Linear(2 * hidden_size, hidden_size, dtype=dtype, device=device)
+        else:
+            self.skip_linear = None
+
+        self.gradient_checkpointing = False
+
+    def _forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
+        # Long Skip Connection
+        if self.skip_linear is not None:
+            cat = torch.cat([x, skip], dim=-1)
+            if cat.dtype != x.dtype:
+                cat = cat.to(x.dtype)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+
+        # Self-Attention
+        shift_msa = self.default_modulation(c).unsqueeze(dim=1)
+        attn_inputs = (
+            self.norm1(x) + shift_msa, freq_cis_img,
+        )
+        x = x + self.attn1(*attn_inputs)[0]
+
+        # Cross-Attention
+        cross_inputs = (
+            self.norm3(x), text_states, freq_cis_img
+        )
+        x = x + self.attn2(*cross_inputs)[0]
+
+        # FFN Layer
+        mlp_inputs = self.norm2(x)
+        x = x + self.mlp(mlp_inputs)
+
+        return x
+
+    def forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
+        if self.gradient_checkpointing and self.training:
+            return checkpoint.checkpoint(self._forward, x, c, text_states, freq_cis_img, skip)
+        return self._forward(x, c, text_states, freq_cis_img, skip)
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of HunYuanDiT.
+    """
+    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(c_emb_size, 2 * final_hidden_size, bias=True, dtype=dtype, device=device)
+        )
+
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+
+class HunYuanDiT(nn.Module):
+    """
+    HunYuanDiT: Diffusion model with a Transformer backbone.
+
+    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
+
+    Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
+
+    Parameters
+    ----------
+    args: argparse.Namespace
+        The arguments parsed by argparse.
+    input_size: tuple
+        The size of the input image.
+    patch_size: int
+        The size of the patch.
+    in_channels: int
+        The number of input channels.
+    hidden_size: int
+        The hidden size of the transformer backbone.
+    depth: int
+        The number of transformer blocks.
+    num_heads: int
+        The number of attention heads.
+    mlp_ratio: float
+        The ratio of the hidden size of the MLP in the transformer block.
+    log_fn: callable
+        The logging function.
+    """
+    #@register_to_config
+    def __init__(self,
+                 input_size: tuple = 32,
+                 patch_size: int = 2,
+                 in_channels: int = 4,
+                 hidden_size: int = 1152,
+                 depth: int = 28,
+                 num_heads: int = 16,
+                 mlp_ratio: float = 4.0,
+                 text_states_dim = 1024,
+                 text_states_dim_t5 = 2048,
+                 text_len = 77,
+                 text_len_t5 = 256,
+                 qk_norm = True,# See http://arxiv.org/abs/2302.05442 for details.
+                 size_cond = False,
+                 use_style_cond = False,
+                 learn_sigma = True,
+                 norm = "layer",
+                 log_fn: callable = print,
+                 attn_precision=None,
+                 dtype=None,
+                 device=None,
+                 operations=None,
+                 **kwargs,
+    ):
+        super().__init__()
+        self.log_fn = log_fn
+        self.depth = depth
+        self.learn_sigma = learn_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.text_states_dim = text_states_dim
+        self.text_states_dim_t5 = text_states_dim_t5
+        self.text_len = text_len
+        self.text_len_t5 = text_len_t5
+        self.size_cond = size_cond
+        self.use_style_cond = use_style_cond
+        self.norm = norm
+        self.dtype = dtype
+        #import pdb
+        #pdb.set_trace()
+
+        self.mlp_t5 = nn.Sequential(
+            operations.Linear(self.text_states_dim_t5, self.text_states_dim_t5 * 4, bias=True, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(self.text_states_dim_t5 * 4, self.text_states_dim, bias=True, dtype=dtype, device=device),
+        )
+        # learnable replace
+        self.text_embedding_padding = nn.Parameter(
+            torch.empty(self.text_len + self.text_len_t5, self.text_states_dim, dtype=dtype, device=device))
+
+        # Attention pooling
+        pooler_out_dim = 1024
+        self.pooler = AttentionPool(self.text_len_t5, self.text_states_dim_t5, num_heads=8, output_dim=pooler_out_dim, dtype=dtype, device=device, operations=operations)
+
+        # Dimension of the extra input vectors
+        self.extra_in_dim = pooler_out_dim
+
+        if self.size_cond:
+            # Image size and crop size conditions
+            self.extra_in_dim += 6 * 256
+
+        if self.use_style_cond:
+            # Here we use a default learned embedder layer for future extension.
+            self.style_embedder = operations.Embedding(1, hidden_size, dtype=dtype, device=device)
+            self.extra_in_dim += hidden_size
+
+        # Text embedding for `add`
+        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, dtype=dtype, device=device, operations=operations)
+        self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype, device=device, operations=operations)
+        self.extra_embedder = nn.Sequential(
+            operations.Linear(self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+        # Image embedding
+        num_patches = self.x_embedder.num_patches
+
+        # HUnYuanDiT Blocks
+        self.blocks = nn.ModuleList([
+            HunYuanDiTBlock(hidden_size=hidden_size,
+                            c_emb_size=hidden_size,
+                            num_heads=num_heads,
+                            mlp_ratio=mlp_ratio,
+                            text_states_dim=self.text_states_dim,
+                            qk_norm=qk_norm,
+                            norm_type=self.norm,
+                            skip=layer > depth // 2,
+                            attn_precision=attn_precision,
+                            dtype=dtype,
+                            device=device,
+                            operations=operations,
+                            )
+            for layer in range(depth)
+        ])
+
+        self.final_layer = FinalLayer(hidden_size, hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
+        self.unpatchify_channels = self.out_channels
+
+
+
+    def forward(self,
+                x,
+                t,
+                context,#encoder_hidden_states=None,
+                text_embedding_mask=None,
+                encoder_hidden_states_t5=None,
+                text_embedding_mask_t5=None,
+                image_meta_size=None,
+                style=None,
+                return_dict=False,
+                control=None,
+                transformer_options=None,
+                ):
+        """
+        Forward pass of the encoder.
+
+        Parameters
+        ----------
+        x: torch.Tensor
+            (B, D, H, W)
+        t: torch.Tensor
+            (B)
+        encoder_hidden_states: torch.Tensor
+            CLIP text embedding, (B, L_clip, D)
+        text_embedding_mask: torch.Tensor
+            CLIP text embedding mask, (B, L_clip)
+        encoder_hidden_states_t5: torch.Tensor
+            T5 text embedding, (B, L_t5, D)
+        text_embedding_mask_t5: torch.Tensor
+            T5 text embedding mask, (B, L_t5)
+        image_meta_size: torch.Tensor
+            (B, 6)
+        style: torch.Tensor
+            (B)
+        cos_cis_img: torch.Tensor
+        sin_cis_img: torch.Tensor
+        return_dict: bool
+            Whether to return a dictionary.
+        """
+        #import pdb
+        #pdb.set_trace()
+        encoder_hidden_states = context
+        text_states = encoder_hidden_states                     # 2,77,1024
+        text_states_t5 = encoder_hidden_states_t5               # 2,256,2048
+        text_states_mask = text_embedding_mask.bool()           # 2,77
+        text_states_t5_mask = text_embedding_mask_t5.bool()     # 2,256
+        b_t5, l_t5, c_t5 = text_states_t5.shape
+        text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
+
+        padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
+
+        text_states[:,-self.text_len:] = torch.where(text_states_mask[:,-self.text_len:].unsqueeze(2), text_states[:,-self.text_len:], padding[:self.text_len])
+        text_states_t5[:,-self.text_len_t5:] = torch.where(text_states_t5_mask[:,-self.text_len_t5:].unsqueeze(2), text_states_t5[:,-self.text_len_t5:], padding[self.text_len:])
+
+        text_states = torch.cat([text_states, text_states_t5], dim=1)  # 2,205，1024
+        # clip_t5_mask = torch.cat([text_states_mask, text_states_t5_mask], dim=-1)
+
+        _, _, oh, ow = x.shape
+        th, tw = (oh + (self.patch_size // 2)) // self.patch_size, (ow + (self.patch_size // 2)) // self.patch_size
+
+
+        # Get image RoPE embedding according to `reso`lution.
+        freqs_cis_img = calc_rope(x, self.patch_size, self.hidden_size // self.num_heads) #(cos_cis_img, sin_cis_img)
+
+        # ========================= Build time and image embedding =========================
+        t = self.t_embedder(t, dtype=x.dtype)
+        x = self.x_embedder(x)
+
+        # ========================= Concatenate all extra vectors =========================
+        # Build text tokens with pooling
+        extra_vec = self.pooler(encoder_hidden_states_t5)
+
+        # Build image meta size tokens if applicable
+        if self.size_cond:
+            image_meta_size = timestep_embedding(image_meta_size.view(-1), 256).to(x.dtype)   # [B * 6, 256]
+            image_meta_size = image_meta_size.view(-1, 6 * 256)
+            extra_vec = torch.cat([extra_vec, image_meta_size], dim=1)  # [B, D + 6 * 256]
+
+        # Build style tokens
+        if self.use_style_cond:
+            if style is None:
+                style = torch.zeros((extra_vec.shape[0],), device=x.device, dtype=torch.int)
+            style_embedding = self.style_embedder(style, out_dtype=x.dtype)
+            extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
+
+        # Concatenate all extra vectors
+        c = t + self.extra_embedder(extra_vec)  # [B, D]
+
+        controls = None
+        if control:
+            controls = control.get("output", None)
+        # ========================= Forward pass through HunYuanDiT blocks =========================
+        skips = []
+        for layer, block in enumerate(self.blocks):
+            if layer > self.depth // 2:
+                if controls is not None:
+                    skip = skips.pop() + controls.pop().to(dtype=x.dtype)
+                else:
+                    skip = skips.pop()
+                x = block(x, c, text_states, freqs_cis_img, skip)   # (N, L, D)
+            else:
+                x = block(x, c, text_states, freqs_cis_img)         # (N, L, D)
+
+            if layer < (self.depth // 2 - 1):
+                skips.append(x)
+        if controls is not None and len(controls) != 0:
+            raise ValueError("The number of controls is not equal to the number of skip connections.")
+
+        # ========================= Final layer =========================
+        x = self.final_layer(x, c)                              # (N, L, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x, th, tw)                          # (N, out_channels, H, W)
+
+        if return_dict:
+            return {'x': x}
+        if self.learn_sigma:
+            return x[:,:self.out_channels // 2,:oh,:ow]
+        return x[:,:,:oh,:ow]
+
+    def unpatchify(self, x, h, w):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.unpatchify_channels
+        p = self.x_embedder.patch_size[0]
+        # h = w = int(x.shape[1] ** 0.5)
+        assert h * w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+        return imgs
--- a/comfy/ldm/hydit/poolers.py
+++ b/comfy/ldm/hydit/poolers.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.ops
+
+class AttentionPool(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.empty(spacial_dim + 1, embed_dim, dtype=dtype, device=device))
+        self.k_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
+        self.q_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
+        self.v_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
+        self.c_proj = operations.Linear(embed_dim, output_dim or embed_dim, dtype=dtype, device=device)
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
+
+    def forward(self, x):
+        x = x[:,:self.positional_embedding.shape[0] - 1]
+        x = x.permute(1, 0, 2)  # NLC -> LNC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (L+1)NC
+        x = x + comfy.ops.cast_to_input(self.positional_embedding[:, None, :], x) # (L+1)NC
+
+        q = self.q_proj(x[:1])
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        batch_size = q.shape[1]
+        head_dim = self.embed_dim // self.num_heads
+        q = q.view(1, batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
+        k = k.view(k.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
+        v = v.view(v.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
+
+        attn_output = optimized_attention(q, k, v, self.num_heads, skip_reshape=True).transpose(0, 1)
+
+        attn_output = self.c_proj(attn_output)
+        return attn_output.squeeze(0)
--- a/comfy/ldm/hydit/posemb_layers.py
+++ b/comfy/ldm/hydit/posemb_layers.py
@@ -0,0 +1,224 @@
+import torch
+import numpy as np
+from typing import Union
+
+
+def _to_tuple(x):
+    if isinstance(x, int):
+        return x, x
+    else:
+        return x
+
+
+def get_fill_resize_and_crop(src, tgt):
+    th, tw = _to_tuple(tgt)
+    h, w = _to_tuple(src)
+
+    tr = th / tw        # base resolution
+    r = h / w           # target resolution
+
+    # resize
+    if r > tr:
+        resize_height = th
+        resize_width = int(round(th / h * w))
+    else:
+        resize_width = tw
+        resize_height = int(round(tw / w * h))    # resize the target resolution down based on the base resolution
+
+    crop_top = int(round((th - resize_height) / 2.0))
+    crop_left = int(round((tw - resize_width) / 2.0))
+
+    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
+
+
+def get_meshgrid(start, *args):
+    if len(args) == 0:
+        # start is grid_size
+        num = _to_tuple(start)
+        start = (0, 0)
+        stop = num
+    elif len(args) == 1:
+        # start is start, args[0] is stop, step is 1
+        start = _to_tuple(start)
+        stop = _to_tuple(args[0])
+        num = (stop[0] - start[0], stop[1] - start[1])
+    elif len(args) == 2:
+        # start is start, args[0] is stop, args[1] is num
+        start = _to_tuple(start)
+        stop = _to_tuple(args[0])
+        num = _to_tuple(args[1])
+    else:
+        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
+
+    grid_h = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)
+    grid_w = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)   # [2, W, H]
+    return grid
+
+#################################################################################
+#                   Sine/Cosine Positional Embedding Functions                  #
+#################################################################################
+# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid = get_meshgrid(start, *args)   # [2, H, w]
+    # grid_h = np.arange(grid_size, dtype=np.float32)
+    # grid_w = np.arange(grid_size, dtype=np.float32)
+    # grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    # grid = np.stack(grid, axis=0)   # [2, W, H]
+
+    grid = grid.reshape([2, 1, *grid.shape[1:]])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+    emb = np.concatenate([emb_h, emb_w], axis=1)    # (H*W, D)
+    return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (W,H)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)   # (M, D/2)
+    emb_cos = np.cos(out)   # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+
+#################################################################################
+#                   Rotary Positional Embedding Functions                       #
+#################################################################################
+# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443
+
+def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True):
+    """
+    This is a 2d version of precompute_freqs_cis, which is a RoPE for image tokens with 2d structure.
+
+    Parameters
+    ----------
+    embed_dim: int
+        embedding dimension size
+    start: int or tuple of int
+        If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1;
+        If len(args) == 2, start is start, args[0] is stop, args[1] is num.
+    use_real: bool
+        If True, return real part and imaginary part separately. Otherwise, return complex numbers.
+
+    Returns
+    -------
+    pos_embed: torch.Tensor
+        [HW, D/2]
+    """
+    grid = get_meshgrid(start, *args)   # [2, H, w]
+    grid = grid.reshape([2, 1, *grid.shape[1:]])   # Returns a sampling matrix with the same resolution as the target resolution
+    pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)
+    return pos_embed
+
+
+def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
+    assert embed_dim % 4 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real)  # (H*W, D/4)
+    emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real)  # (H*W, D/4)
+
+    if use_real:
+        cos = torch.cat([emb_h[0], emb_w[0]], dim=1)    # (H*W, D/2)
+        sin = torch.cat([emb_h[1], emb_w[1]], dim=1)    # (H*W, D/2)
+        return cos, sin
+    else:
+        emb = torch.cat([emb_h, emb_w], dim=1)    # (H*W, D/2)
+        return emb
+
+
+def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):
+    """
+    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+
+    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+    and the end index 'end'. The 'theta' parameter scales the frequencies.
+    The returned tensor contains complex values in complex64 data type.
+
+    Args:
+        dim (int): Dimension of the frequency tensor.
+        pos (np.ndarray, int): Position indices for the frequency tensor. [S] or scalar
+        theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+        use_real (bool, optional): If True, return real part and imaginary part separately.
+                                   Otherwise, return complex numbers.
+
+    Returns:
+        torch.Tensor: Precomputed frequency tensor with complex exponentials. [S, D/2]
+
+    """
+    if isinstance(pos, int):
+        pos = np.arange(pos)
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))  # [D/2]
+    t = torch.from_numpy(pos).to(freqs.device)  # type: ignore  # [S]
+    freqs = torch.outer(t, freqs).float()  # type: ignore   # [S, D/2]
+    if use_real:
+        freqs_cos = freqs.cos().repeat_interleave(2, dim=1)  # [S, D]
+        freqs_sin = freqs.sin().repeat_interleave(2, dim=1)  # [S, D]
+        return freqs_cos, freqs_sin
+    else:
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64     # [S, D/2]
+        return freqs_cis
+
+
+
+def calc_sizes(rope_img, patch_size, th, tw):
+    if rope_img == 'extend':
+        # Expansion mode
+        sub_args = [(th, tw)]
+    elif rope_img.startswith('base'):
+        # Based on the specified dimensions, other dimensions are obtained through interpolation.
+        base_size = int(rope_img[4:]) // 8 // patch_size
+        start, stop = get_fill_resize_and_crop((th, tw), base_size)
+        sub_args = [start, stop, (th, tw)]
+    else:
+        raise ValueError(f"Unknown rope_img: {rope_img}")
+    return sub_args
+
+
+def init_image_posemb(rope_img,
+                      resolutions,
+                      patch_size,
+                      hidden_size,
+                      num_heads,
+                      log_fn,
+                      rope_real=True,
+                      ):
+    freqs_cis_img = {}
+    for reso in resolutions:
+        th, tw = reso.height // 8 // patch_size, reso.width // 8 // patch_size
+        sub_args = calc_sizes(rope_img, patch_size, th, tw)
+        freqs_cis_img[str(reso)] = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real)
+        log_fn(f"    Using image RoPE ({rope_img}) ({'real' if rope_real else 'complex'}): {sub_args} | ({reso}) "
+               f"{freqs_cis_img[str(reso)][0].shape if rope_real else freqs_cis_img[str(reso)].shape}")
+    return freqs_cis_img
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@@ -0,0 +1,502 @@
+import torch
+from torch import nn
+import comfy.ldm.modules.attention
+from comfy.ldm.genmo.joint_model.layers import RMSNorm
+import comfy.ldm.common_dit
+from einops import rearrange
+import math
+from typing import Dict, Optional, Tuple
+
+from .symmetric_patchifier import SymmetricPatchifier
+
+
+def get_timestep_embedding(
+    timesteps: torch.Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    Args
+        timesteps (torch.Tensor):
+            a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        embedding_dim (int):
+            the dimension of the output.
+        flip_sin_to_cos (bool):
+            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
+        downscale_freq_shift (float):
+            Controls the delta between frequencies between dimensions
+        scale (float):
+            Scaling factor applied to the embeddings.
+        max_period (int):
+            Controls the maximum frequency of the embeddings
+    Returns
+        torch.Tensor: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
+    exponent = exponent / (half_dim - downscale_freq_shift)
+
+    emb = torch.exp(exponent)
+    emb = timesteps[:, None].float() * emb[None, :]
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+    # flip sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+    # zero pad
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        time_embed_dim: int,
+        act_fn: str = "silu",
+        out_dim: int = None,
+        post_act_fn: Optional[str] = None,
+        cond_proj_dim=None,
+        sample_proj_bias=True,
+        dtype=None, device=None, operations=None,
+    ):
+        super().__init__()
+
+        self.linear_1 = operations.Linear(in_channels, time_embed_dim, sample_proj_bias, dtype=dtype, device=device)
+
+        if cond_proj_dim is not None:
+            self.cond_proj = operations.Linear(cond_proj_dim, in_channels, bias=False, dtype=dtype, device=device)
+        else:
+            self.cond_proj = None
+
+        self.act = nn.SiLU()
+
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias, dtype=dtype, device=device)
+
+        if post_act_fn is None:
+            self.post_act = None
+        # else:
+        #     self.post_act = get_activation(post_act_fn)
+
+    def forward(self, sample, condition=None):
+        if condition is not None:
+            sample = sample + self.cond_proj(condition)
+        sample = self.linear_1(sample)
+
+        if self.act is not None:
+            sample = self.act(sample)
+
+        sample = self.linear_2(sample)
+
+        if self.post_act is not None:
+            sample = self.post_act(sample)
+        return sample
+
+
+class Timesteps(nn.Module):
+    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
+        super().__init__()
+        self.num_channels = num_channels
+        self.flip_sin_to_cos = flip_sin_to_cos
+        self.downscale_freq_shift = downscale_freq_shift
+        self.scale = scale
+
+    def forward(self, timesteps):
+        t_emb = get_timestep_embedding(
+            timesteps,
+            self.num_channels,
+            flip_sin_to_cos=self.flip_sin_to_cos,
+            downscale_freq_shift=self.downscale_freq_shift,
+            scale=self.scale,
+        )
+        return t_emb
+
+
+class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
+    """
+    For PixArt-Alpha.
+
+    Reference:
+    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
+    """
+
+    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        self.outdim = size_emb_dim
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+        return timesteps_emb
+
+
+class AdaLayerNormSingle(nn.Module):
+    r"""
+    Norm layer adaptive layer norm single (adaLN-single).
+
+    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
+    """
+
+    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
+            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions, dtype=dtype, device=device, operations=operations
+        )
+
+        self.silu = nn.SiLU()
+        self.linear = operations.Linear(embedding_dim, 6 * embedding_dim, bias=True, dtype=dtype, device=device)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        batch_size: Optional[int] = None,
+        hidden_dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # No modulation happening here.
+        added_cond_kwargs = added_cond_kwargs or {"resolution": None, "aspect_ratio": None}
+        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
+        return self.linear(self.silu(embedded_timestep)), embedded_timestep
+
+class PixArtAlphaTextProjection(nn.Module):
+    """
+    Projects caption embeddings. Also handles dropout for classifier-free guidance.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh", dtype=None, device=None, operations=None):
+        super().__init__()
+        if out_features is None:
+            out_features = hidden_size
+        self.linear_1 = operations.Linear(in_features=in_features, out_features=hidden_size, bias=True, dtype=dtype, device=device)
+        if act_fn == "gelu_tanh":
+            self.act_1 = nn.GELU(approximate="tanh")
+        elif act_fn == "silu":
+            self.act_1 = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation function: {act_fn}")
+        self.linear_2 = operations.Linear(in_features=hidden_size, out_features=out_features, bias=True, dtype=dtype, device=device)
+
+    def forward(self, caption):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class GELU_approx(nn.Module):
+    def __init__(self, dim_in, dim_out, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.proj = operations.Linear(dim_in, dim_out, dtype=dtype, device=device)
+
+    def forward(self, x):
+        return torch.nn.functional.gelu(self.proj(x), approximate="tanh")
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out, mult=4, glu=False, dropout=0., dtype=None, device=None, operations=None):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        project_in = GELU_approx(dim, inner_dim, dtype=dtype, device=device, operations=operations)
+
+        self.net = nn.Sequential(
+            project_in,
+            nn.Dropout(dropout),
+            operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and pick the best/fastest one
+    cos_freqs = freqs_cis[0]
+    sin_freqs = freqs_cis[1]
+
+    t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2)
+    t1, t2 = t_dup.unbind(dim=-1)
+    t_dup = torch.stack((-t2, t1), dim=-1)
+    input_tensor_rot = rearrange(t_dup, "... d r -> ... (d r)")
+
+    out = input_tensor * cos_freqs + input_tensor_rot * sin_freqs
+
+    return out
+
+
+class CrossAttention(nn.Module):
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = query_dim if context_dim is None else context_dim
+        self.attn_precision = attn_precision
+
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.q_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.k_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
+
+        self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device)
+        self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
+        self.to_v = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))
+
+    def forward(self, x, context=None, mask=None, pe=None):
+        q = self.to_q(x)
+        context = x if context is None else context
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        if pe is not None:
+            q = apply_rotary_emb(q, pe)
+            k = apply_rotary_emb(k, pe)
+
+        if mask is None:
+            out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision)
+        else:
+            out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision)
+        return self.to_out(out)
+
+
+class BasicTransformerBlock(nn.Module):
+    def __init__(self, dim, n_heads, d_head, context_dim=None, attn_precision=None, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        self.attn_precision = attn_precision
+        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, context_dim=None, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)
+        self.ff = FeedForward(dim, dim_out=dim, glu=True, dtype=dtype, device=device, operations=operations)
+
+        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, attn_precision=self.attn_precision, dtype=dtype, device=device, operations=operations)
+
+        self.scale_shift_table = nn.Parameter(torch.empty(6, dim, device=device, dtype=dtype))
+
+    def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None):
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None] + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2)
+
+        x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe) * gate_msa
+
+        x += self.attn2(x, context=context, mask=attention_mask)
+
+        y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp
+        x += self.ff(y) * gate_mlp
+
+        return x
+
+def get_fractional_positions(indices_grid, max_pos):
+    fractional_positions = torch.stack(
+        [
+            indices_grid[:, i] / max_pos[i]
+            for i in range(3)
+        ],
+        dim=-1,
+    )
+    return fractional_positions
+
+
+def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[20, 2048, 2048]):
+    dtype = torch.float32 #self.dtype
+
+    fractional_positions = get_fractional_positions(indices_grid, max_pos)
+
+    start = 1
+    end = theta
+    device = fractional_positions.device
+
+    indices = theta ** (
+        torch.linspace(
+            math.log(start, theta),
+            math.log(end, theta),
+            dim // 6,
+            device=device,
+            dtype=dtype,
+        )
+    )
+    indices = indices.to(dtype=dtype)
+
+    indices = indices * math.pi / 2
+
+    freqs = (
+        (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
+        .transpose(-1, -2)
+        .flatten(2)
+    )
+
+    cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
+    sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
+    if dim % 6 != 0:
+        cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
+        sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
+        cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
+        sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
+    return cos_freq.to(out_dtype), sin_freq.to(out_dtype)
+
+
+class LTXVModel(torch.nn.Module):
+    def __init__(self,
+                 in_channels=128,
+                 cross_attention_dim=2048,
+                 attention_head_dim=64,
+                 num_attention_heads=32,
+
+                 caption_channels=4096,
+                 num_layers=28,
+
+
+                 positional_embedding_theta=10000.0,
+                 positional_embedding_max_pos=[20, 2048, 2048],
+                 dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        self.dtype = dtype
+        self.out_channels = in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.adaln_single = AdaLayerNormSingle(
+            self.inner_dim, use_additional_conditions=False, dtype=dtype, device=device, operations=operations
+        )
+
+        # self.adaln_single.linear = operations.Linear(self.inner_dim, 4 * self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.caption_projection = PixArtAlphaTextProjection(
+            in_features=caption_channels, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations
+        )
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    self.inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    context_dim=cross_attention_dim,
+                    # attn_precision=attn_precision,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        self.scale_shift_table = nn.Parameter(torch.empty(2, self.inner_dim, dtype=dtype, device=device))
+        self.norm_out = operations.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.proj_out = operations.Linear(self.inner_dim, self.out_channels, dtype=dtype, device=device)
+
+        self.patchifier = SymmetricPatchifier(1)
+
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, **kwargs):
+        indices_grid = self.patchifier.get_grid(
+            orig_num_frames=x.shape[2],
+            orig_height=x.shape[3],
+            orig_width=x.shape[4],
+            batch_size=x.shape[0],
+            scale_grid=((1 / frame_rate) * 8, 32, 32), #TODO: controlable frame rate
+            device=x.device,
+        )
+
+        if guiding_latent is not None:
+            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
+            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
+            ts *= input_ts
+            ts[:, :, 0] = 0.0
+            timestep = self.patchifier.patchify(ts)
+            input_x = x.clone()
+            x[:, :, 0] = guiding_latent[:, :, 0]
+
+        orig_shape = list(x.shape)
+
+        x = self.patchifier.patchify(x)
+
+        x = self.patchify_proj(x)
+        timestep = timestep * 1000.0
+
+        attention_mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1]))
+        attention_mask = attention_mask.masked_fill(attention_mask.to(torch.bool), float("-inf"))  # not sure about this
+        # attention_mask = (context != 0).any(dim=2).to(dtype=x.dtype)
+
+        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
+
+        batch_size = x.shape[0]
+        timestep, embedded_timestep = self.adaln_single(
+            timestep.flatten(),
+            {"resolution": None, "aspect_ratio": None},
+            batch_size=batch_size,
+            hidden_dtype=x.dtype,
+        )
+        # Second dimension is 1 or number of tokens (if timestep_per_token)
+        timestep = timestep.view(batch_size, -1, timestep.shape[-1])
+        embedded_timestep = embedded_timestep.view(
+            batch_size, -1, embedded_timestep.shape[-1]
+        )
+
+        # 2. Blocks
+        if self.caption_projection is not None:
+            batch_size = x.shape[0]
+            context = self.caption_projection(context)
+            context = context.view(
+                batch_size, -1, x.shape[-1]
+            )
+
+        for block in self.transformer_blocks:
+            x = block(
+                x,
+                context=context,
+                attention_mask=attention_mask,
+                timestep=timestep,
+                pe=pe
+            )
+
+        # 3. Output
+        scale_shift_values = (
+            self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
+        )
+        shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
+        x = self.norm_out(x)
+        # Modulation
+        x = x * (1 + scale) + shift
+        x = self.proj_out(x)
+
+        x = self.patchifier.unpatchify(
+            latents=x,
+            output_height=orig_shape[3],
+            output_width=orig_shape[4],
+            output_num_frames=orig_shape[2],
+            out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
+        )
+
+        if guiding_latent is not None:
+            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
+
+        # print("res", x)
+        return x
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@@ -0,0 +1,105 @@
+from abc import ABC, abstractmethod
+from typing import Tuple
+
+import torch
+from einops import rearrange
+from torch import Tensor
+
+
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    elif dims_to_append == 0:
+        return x
+    return x[(...,) + (None,) * dims_to_append]
+
+
+class Patchifier(ABC):
+    def __init__(self, patch_size: int):
+        super().__init__()
+        self._patch_size = (1, patch_size, patch_size)
+
+    @abstractmethod
+    def patchify(
+        self, latents: Tensor, frame_rates: Tensor, scale_grid: bool
+    ) -> Tuple[Tensor, Tensor]:
+        pass
+
+    @abstractmethod
+    def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        output_num_frames: int,
+        out_channels: int,
+    ) -> Tuple[Tensor, Tensor]:
+        pass
+
+    @property
+    def patch_size(self):
+        return self._patch_size
+
+    def get_grid(
+        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
+    ):
+        f = orig_num_frames // self._patch_size[0]
+        h = orig_height // self._patch_size[1]
+        w = orig_width // self._patch_size[2]
+        grid_h = torch.arange(h, dtype=torch.float32, device=device)
+        grid_w = torch.arange(w, dtype=torch.float32, device=device)
+        grid_f = torch.arange(f, dtype=torch.float32, device=device)
+        grid = torch.meshgrid(grid_f, grid_h, grid_w)
+        grid = torch.stack(grid, dim=0)
+        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+
+        if scale_grid is not None:
+            for i in range(3):
+                if isinstance(scale_grid[i], Tensor):
+                    scale = append_dims(scale_grid[i], grid.ndim - 1)
+                else:
+                    scale = scale_grid[i]
+                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
+
+        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
+        return grid
+
+
+class SymmetricPatchifier(Patchifier):
+    def patchify(
+        self,
+        latents: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        latents = rearrange(
+            latents,
+            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
+            p1=self._patch_size[0],
+            p2=self._patch_size[1],
+            p3=self._patch_size[2],
+        )
+        return latents
+
+    def unpatchify(
+        self,
+        latents: Tensor,
+        output_height: int,
+        output_width: int,
+        output_num_frames: int,
+        out_channels: int,
+    ) -> Tuple[Tensor, Tensor]:
+        output_height = output_height // self._patch_size[1]
+        output_width = output_width // self._patch_size[2]
+        latents = rearrange(
+            latents,
+            "b (f h w) (c p q) -> b c f (h p) (w q) ",
+            f=output_num_frames,
+            h=output_height,
+            w=output_width,
+            p=self._patch_size[1],
+            q=self._patch_size[2],
+        )
+        return latents
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@@ -0,0 +1,62 @@
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+
+
+class CausalConv3d(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size: int = 3,
+        stride: Union[int, Tuple[int]] = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        kernel_size = (kernel_size, kernel_size, kernel_size)
+        self.time_kernel_size = kernel_size[0]
+
+        dilation = (dilation, 1, 1)
+
+        height_pad = kernel_size[1] // 2
+        width_pad = kernel_size[2] // 2
+        padding = (0, height_pad, width_pad)
+
+        self.conv = nn.Conv3d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            padding=padding,
+            padding_mode="zeros",
+            groups=groups,
+        )
+
+    def forward(self, x, causal: bool = True):
+        if causal:
+            first_frame_pad = x[:, :, :1, :, :].repeat(
+                (1, 1, self.time_kernel_size - 1, 1, 1)
+            )
+            x = torch.concatenate((first_frame_pad, x), dim=2)
+        else:
+            first_frame_pad = x[:, :, :1, :, :].repeat(
+                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+            )
+            last_frame_pad = x[:, :, -1:, :, :].repeat(
+                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+            )
+            x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
+        x = self.conv(x)
+        return x
+
+    @property
+    def weight(self):
+        return self.conv.weight
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -0,0 +1,698 @@
+import torch
+from torch import nn
+from functools import partial
+import math
+from einops import rearrange
+from typing import Any, Mapping, Optional, Tuple, Union, List
+from .conv_nd_factory import make_conv_nd, make_linear_nd
+from .pixel_norm import PixelNorm
+
+
+class Encoder(nn.Module):
+    r"""
+    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
+            The number of dimensions to use in convolutions.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
+            The blocks to use. Each block is a tuple of the block name and the number of layers.
+        base_channels (`int`, *optional*, defaults to 128):
+            The number of output channels for the first convolutional layer.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        patch_size (`int`, *optional*, defaults to 1):
+            The patch size to use. Should be a power of 2.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        latent_log_var (`str`, *optional*, defaults to `per_channel`):
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]] = 3,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        blocks=[("res_x", 1)],
+        base_channels: int = 128,
+        norm_num_groups: int = 32,
+        patch_size: Union[int, Tuple[int]] = 1,
+        norm_layer: str = "group_norm",  # group_norm, pixel_norm
+        latent_log_var: str = "per_channel",
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.norm_layer = norm_layer
+        self.latent_channels = out_channels
+        self.latent_log_var = latent_log_var
+        self.blocks_desc = blocks
+
+        in_channels = in_channels * patch_size**2
+        output_channel = base_channels
+
+        self.conv_in = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=output_channel,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
+
+        self.down_blocks = nn.ModuleList([])
+
+        for block_name, block_params in blocks:
+            input_channel = output_channel
+            if isinstance(block_params, int):
+                block_params = {"num_layers": block_params}
+
+            if block_name == "res_x":
+                block = UNetMidBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    num_layers=block_params["num_layers"],
+                    resnet_eps=1e-6,
+                    resnet_groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                )
+            elif block_name == "res_x_y":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = ResnetBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    eps=1e-6,
+                    groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                )
+            elif block_name == "compress_time":
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=(2, 1, 1),
+                    causal=True,
+                )
+            elif block_name == "compress_space":
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=(1, 2, 2),
+                    causal=True,
+                )
+            elif block_name == "compress_all":
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=(2, 2, 2),
+                    causal=True,
+                )
+            elif block_name == "compress_all_x_y":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=(2, 2, 2),
+                    causal=True,
+                )
+            else:
+                raise ValueError(f"unknown block: {block_name}")
+
+            self.down_blocks.append(block)
+
+        # out
+        if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+            )
+        elif norm_layer == "pixel_norm":
+            self.conv_norm_out = PixelNorm()
+        elif norm_layer == "layer_norm":
+            self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
+
+        self.conv_act = nn.SiLU()
+
+        conv_out_channels = out_channels
+        if latent_log_var == "per_channel":
+            conv_out_channels *= 2
+        elif latent_log_var == "uniform":
+            conv_out_channels += 1
+        elif latent_log_var != "none":
+            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
+        self.conv_out = make_conv_nd(
+            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
+        )
+
+        self.gradient_checkpointing = False
+
+    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `Encoder` class."""
+
+        sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
+        sample = self.conv_in(sample)
+
+        checkpoint_fn = (
+            partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
+            if self.gradient_checkpointing and self.training
+            else lambda x: x
+        )
+
+        for down_block in self.down_blocks:
+            sample = checkpoint_fn(down_block)(sample)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if self.latent_log_var == "uniform":
+            last_channel = sample[:, -1:, ...]
+            num_dims = sample.dim()
+
+            if num_dims == 4:
+                # For shape (B, C, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1
+                )
+                sample = torch.cat([sample, repeated_last_channel], dim=1)
+            elif num_dims == 5:
+                # For shape (B, C, F, H, W)
+                repeated_last_channel = last_channel.repeat(
+                    1, sample.shape[1] - 2, 1, 1, 1
+                )
+                sample = torch.cat([sample, repeated_last_channel], dim=1)
+            else:
+                raise ValueError(f"Invalid input shape: {sample.shape}")
+
+        return sample
+
+
+class Decoder(nn.Module):
+    r"""
+    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
+            The number of dimensions to use in convolutions.
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
+            The blocks to use. Each block is a tuple of the block name and the number of layers.
+        base_channels (`int`, *optional*, defaults to 128):
+            The number of output channels for the first convolutional layer.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        patch_size (`int`, *optional*, defaults to 1):
+            The patch size to use. Should be a power of 2.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        causal (`bool`, *optional*, defaults to `True`):
+            Whether to use causal convolutions or not.
+    """
+
+    def __init__(
+        self,
+        dims,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        blocks=[("res_x", 1)],
+        base_channels: int = 128,
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        patch_size: int = 1,
+        norm_layer: str = "group_norm",
+        causal: bool = True,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.layers_per_block = layers_per_block
+        out_channels = out_channels * patch_size**2
+        self.causal = causal
+        self.blocks_desc = blocks
+
+        # Compute output channel to be product of all channel-multiplier blocks
+        output_channel = base_channels
+        for block_name, block_params in list(reversed(blocks)):
+            block_params = block_params if isinstance(block_params, dict) else {}
+            if block_name == "res_x_y":
+                output_channel = output_channel * block_params.get("multiplier", 2)
+
+        self.conv_in = make_conv_nd(
+            dims,
+            in_channels,
+            output_channel,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
+
+        self.up_blocks = nn.ModuleList([])
+
+        for block_name, block_params in list(reversed(blocks)):
+            input_channel = output_channel
+            if isinstance(block_params, int):
+                block_params = {"num_layers": block_params}
+
+            if block_name == "res_x":
+                block = UNetMidBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    num_layers=block_params["num_layers"],
+                    resnet_eps=1e-6,
+                    resnet_groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                )
+            elif block_name == "res_x_y":
+                output_channel = output_channel // block_params.get("multiplier", 2)
+                block = ResnetBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    eps=1e-6,
+                    groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                )
+            elif block_name == "compress_time":
+                block = DepthToSpaceUpsample(
+                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
+                )
+            elif block_name == "compress_space":
+                block = DepthToSpaceUpsample(
+                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
+                )
+            elif block_name == "compress_all":
+                block = DepthToSpaceUpsample(
+                    dims=dims,
+                    in_channels=input_channel,
+                    stride=(2, 2, 2),
+                    residual=block_params.get("residual", False),
+                )
+            else:
+                raise ValueError(f"unknown layer: {block_name}")
+
+            self.up_blocks.append(block)
+
+        if norm_layer == "group_norm":
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
+            )
+        elif norm_layer == "pixel_norm":
+            self.conv_norm_out = PixelNorm()
+        elif norm_layer == "layer_norm":
+            self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
+
+        self.conv_act = nn.SiLU()
+        self.conv_out = make_conv_nd(
+            dims, output_channel, out_channels, 3, padding=1, causal=True
+        )
+
+        self.gradient_checkpointing = False
+
+    # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
+    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+        r"""The forward method of the `Decoder` class."""
+        # assert target_shape is not None, "target_shape must be provided"
+
+        sample = self.conv_in(sample, causal=self.causal)
+
+        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
+
+        checkpoint_fn = (
+            partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
+            if self.gradient_checkpointing and self.training
+            else lambda x: x
+        )
+
+        sample = sample.to(upscale_dtype)
+
+        for up_block in self.up_blocks:
+            sample = checkpoint_fn(up_block)(sample, causal=self.causal)
+
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample, causal=self.causal)
+
+        sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
+
+        return sample
+
+
+class UNetMidBlock3D(nn.Module):
+    """
+    A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, height, width)`.
+
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_groups: int = 32,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        self.res_blocks = nn.ModuleList(
+            [
+                ResnetBlock3D(
+                    dims=dims,
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    norm_layer=norm_layer,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, causal: bool = True
+    ) -> torch.FloatTensor:
+        for resnet in self.res_blocks:
+            hidden_states = resnet(hidden_states, causal=causal)
+
+        return hidden_states
+
+
+class DepthToSpaceUpsample(nn.Module):
+    def __init__(self, dims, in_channels, stride, residual=False):
+        super().__init__()
+        self.stride = stride
+        self.out_channels = math.prod(stride) * in_channels
+        self.conv = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=self.out_channels,
+            kernel_size=3,
+            stride=1,
+            causal=True,
+        )
+        self.residual = residual
+
+    def forward(self, x, causal: bool = True):
+        if self.residual:
+            # Reshape and duplicate the input to match the output shape
+            x_in = rearrange(
+                x,
+                "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
+                p1=self.stride[0],
+                p2=self.stride[1],
+                p3=self.stride[2],
+            )
+            x_in = x_in.repeat(1, math.prod(self.stride), 1, 1, 1)
+            if self.stride[0] == 2:
+                x_in = x_in[:, :, 1:, :, :]
+        x = self.conv(x, causal=causal)
+        x = rearrange(
+            x,
+            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
+            p1=self.stride[0],
+            p2=self.stride[1],
+            p3=self.stride[2],
+        )
+        if self.stride[0] == 2:
+            x = x[:, :, 1:, :, :]
+        if self.residual:
+            x = x + x_in
+        return x
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, dim, eps, elementwise_affine=True) -> None:
+        super().__init__()
+        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
+
+    def forward(self, x):
+        x = rearrange(x, "b c d h w -> b d h w c")
+        x = self.norm(x)
+        x = rearrange(x, "b d h w c -> b c d h w")
+        return x
+
+
+class ResnetBlock3D(nn.Module):
+    r"""
+    A Resnet block.
+
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv layer. If None, same as `in_channels`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        dropout: float = 0.0,
+        groups: int = 32,
+        eps: float = 1e-6,
+        norm_layer: str = "group_norm",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+
+        if norm_layer == "group_norm":
+            self.norm1 = nn.GroupNorm(
+                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
+            )
+        elif norm_layer == "pixel_norm":
+            self.norm1 = PixelNorm()
+        elif norm_layer == "layer_norm":
+            self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)
+
+        self.non_linearity = nn.SiLU()
+
+        self.conv1 = make_conv_nd(
+            dims,
+            in_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
+
+        if norm_layer == "group_norm":
+            self.norm2 = nn.GroupNorm(
+                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
+            )
+        elif norm_layer == "pixel_norm":
+            self.norm2 = PixelNorm()
+        elif norm_layer == "layer_norm":
+            self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)
+
+        self.dropout = torch.nn.Dropout(dropout)
+
+        self.conv2 = make_conv_nd(
+            dims,
+            out_channels,
+            out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            causal=True,
+        )
+
+        self.conv_shortcut = (
+            make_linear_nd(
+                dims=dims, in_channels=in_channels, out_channels=out_channels
+            )
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+
+        self.norm3 = (
+            LayerNorm(in_channels, eps=eps, elementwise_affine=True)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+
+    def forward(
+        self,
+        input_tensor: torch.FloatTensor,
+        causal: bool = True,
+    ) -> torch.FloatTensor:
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states)
+
+        hidden_states = self.non_linearity(hidden_states)
+
+        hidden_states = self.conv1(hidden_states, causal=causal)
+
+        hidden_states = self.norm2(hidden_states)
+
+        hidden_states = self.non_linearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = self.conv2(hidden_states, causal=causal)
+
+        input_tensor = self.norm3(input_tensor)
+
+        input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = input_tensor + hidden_states
+
+        return output_tensor
+
+
+def patchify(x, patch_size_hw, patch_size_t=1):
+    if patch_size_hw == 1 and patch_size_t == 1:
+        return x
+    if x.dim() == 4:
+        x = rearrange(
+            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
+        )
+    elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
+    else:
+        raise ValueError(f"Invalid input shape: {x.shape}")
+
+    return x
+
+
+def unpatchify(x, patch_size_hw, patch_size_t=1):
+    if patch_size_hw == 1 and patch_size_t == 1:
+        return x
+
+    if x.dim() == 4:
+        x = rearrange(
+            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
+        )
+    elif x.dim() == 5:
+        x = rearrange(
+            x,
+            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
+            p=patch_size_t,
+            q=patch_size_hw,
+            r=patch_size_hw,
+        )
+
+    return x
+
+class processor(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.register_buffer("std-of-means", torch.empty(128))
+        self.register_buffer("mean-of-means", torch.empty(128))
+        self.register_buffer("mean-of-stds", torch.empty(128))
+        self.register_buffer("mean-of-stds_over_std-of-means", torch.empty(128))
+        self.register_buffer("channel", torch.empty(128))
+
+    def un_normalize(self, x):
+        return (x * self.get_buffer("std-of-means").view(1, -1, 1, 1, 1)) + self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1)
+
+    def normalize(self, x):
+        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1)
+
+class VideoVAE(nn.Module):
+    def __init__(self):
+        super().__init__()
+        config = {
+            "_class_name": "CausalVideoAutoencoder",
+            "dims": 3,
+            "in_channels": 3,
+            "out_channels": 3,
+            "latent_channels": 128,
+            "blocks": [
+                ["res_x", 4],
+                ["compress_all", 1],
+                ["res_x_y", 1],
+                ["res_x", 3],
+                ["compress_all", 1],
+                ["res_x_y", 1],
+                ["res_x", 3],
+                ["compress_all", 1],
+                ["res_x", 3],
+                ["res_x", 4],
+            ],
+            "scaling_factor": 1.0,
+            "norm_layer": "pixel_norm",
+            "patch_size": 4,
+            "latent_log_var": "uniform",
+            "use_quant_conv": False,
+            "causal_decoder": False,
+        }
+
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+
+        self.encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            blocks=config.get("encoder_blocks", config.get("blocks")),
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+        )
+
+        self.decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            blocks=config.get("decoder_blocks", config.get("blocks")),
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            causal=config.get("causal_decoder", False),
+        )
+
+        self.per_channel_statistics = processor()
+
+    def encode(self, x):
+        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
+        return self.per_channel_statistics.normalize(means)
+
+    def decode(self, x):
+        return self.decoder(self.per_channel_statistics.un_normalize(x))
+
--- a/comfy/ldm/lightricks/vae/conv_nd_factory.py
+++ b/comfy/ldm/lightricks/vae/conv_nd_factory.py
@@ -0,0 +1,82 @@
+from typing import Tuple, Union
+
+import torch
+
+from .dual_conv3d import DualConv3d
+from .causal_conv3d import CausalConv3d
+
+
+def make_conv_nd(
+    dims: Union[int, Tuple[int, int]],
+    in_channels: int,
+    out_channels: int,
+    kernel_size: int,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    bias=True,
+    causal=False,
+):
+    if dims == 2:
+        return torch.nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+    elif dims == 3:
+        if causal:
+            return CausalConv3d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+            )
+        return torch.nn.Conv3d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+    elif dims == (2, 1):
+        return DualConv3d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias,
+        )
+    else:
+        raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def make_linear_nd(
+    dims: int,
+    in_channels: int,
+    out_channels: int,
+    bias=True,
+):
+    if dims == 2:
+        return torch.nn.Conv2d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
+        )
+    elif dims == 3 or dims == (2, 1):
+        return torch.nn.Conv3d(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
+        )
+    else:
+        raise ValueError(f"unsupported dimensions: {dims}")
--- a/comfy/ldm/lightricks/vae/dual_conv3d.py
+++ b/comfy/ldm/lightricks/vae/dual_conv3d.py
@@ -0,0 +1,195 @@
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+
+class DualConv3d(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        padding: Union[int, Tuple[int, int, int]] = 0,
+        dilation: Union[int, Tuple[int, int, int]] = 1,
+        groups=1,
+        bias=True,
+    ):
+        super(DualConv3d, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size, kernel_size)
+        if kernel_size == (1, 1, 1):
+            raise ValueError(
+                "kernel_size must be greater than 1. Use make_linear_nd instead."
+            )
+        if isinstance(stride, int):
+            stride = (stride, stride, stride)
+        if isinstance(padding, int):
+            padding = (padding, padding, padding)
+        if isinstance(dilation, int):
+            dilation = (dilation, dilation, dilation)
+
+        # Set parameters for convolutions
+        self.groups = groups
+        self.bias = bias
+
+        # Define the size of the channels after the first convolution
+        intermediate_channels = (
+            out_channels if in_channels < out_channels else in_channels
+        )
+
+        # Define parameters for the first convolution
+        self.weight1 = nn.Parameter(
+            torch.Tensor(
+                intermediate_channels,
+                in_channels // groups,
+                1,
+                kernel_size[1],
+                kernel_size[2],
+            )
+        )
+        self.stride1 = (1, stride[1], stride[2])
+        self.padding1 = (0, padding[1], padding[2])
+        self.dilation1 = (1, dilation[1], dilation[2])
+        if bias:
+            self.bias1 = nn.Parameter(torch.Tensor(intermediate_channels))
+        else:
+            self.register_parameter("bias1", None)
+
+        # Define parameters for the second convolution
+        self.weight2 = nn.Parameter(
+            torch.Tensor(
+                out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
+            )
+        )
+        self.stride2 = (stride[0], 1, 1)
+        self.padding2 = (padding[0], 0, 0)
+        self.dilation2 = (dilation[0], 1, 1)
+        if bias:
+            self.bias2 = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.register_parameter("bias2", None)
+
+        # Initialize weights and biases
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
+        nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
+        if self.bias:
+            fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
+            bound1 = 1 / math.sqrt(fan_in1)
+            nn.init.uniform_(self.bias1, -bound1, bound1)
+            fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
+            bound2 = 1 / math.sqrt(fan_in2)
+            nn.init.uniform_(self.bias2, -bound2, bound2)
+
+    def forward(self, x, use_conv3d=False, skip_time_conv=False):
+        if use_conv3d:
+            return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
+        else:
+            return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)
+
+    def forward_with_3d(self, x, skip_time_conv):
+        # First convolution
+        x = F.conv3d(
+            x,
+            self.weight1,
+            self.bias1,
+            self.stride1,
+            self.padding1,
+            self.dilation1,
+            self.groups,
+        )
+
+        if skip_time_conv:
+            return x
+
+        # Second convolution
+        x = F.conv3d(
+            x,
+            self.weight2,
+            self.bias2,
+            self.stride2,
+            self.padding2,
+            self.dilation2,
+            self.groups,
+        )
+
+        return x
+
+    def forward_with_2d(self, x, skip_time_conv):
+        b, c, d, h, w = x.shape
+
+        # First 2D convolution
+        x = rearrange(x, "b c d h w -> (b d) c h w")
+        # Squeeze the depth dimension out of weight1 since it's 1
+        weight1 = self.weight1.squeeze(2)
+        # Select stride, padding, and dilation for the 2D convolution
+        stride1 = (self.stride1[1], self.stride1[2])
+        padding1 = (self.padding1[1], self.padding1[2])
+        dilation1 = (self.dilation1[1], self.dilation1[2])
+        x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
+
+        _, _, h, w = x.shape
+
+        if skip_time_conv:
+            x = rearrange(x, "(b d) c h w -> b c d h w", b=b)
+            return x
+
+        # Second convolution which is essentially treated as a 1D convolution across the 'd' dimension
+        x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)
+
+        # Reshape weight2 to match the expected dimensions for conv1d
+        weight2 = self.weight2.squeeze(-1).squeeze(-1)
+        # Use only the relevant dimension for stride, padding, and dilation for the 1D convolution
+        stride2 = self.stride2[0]
+        padding2 = self.padding2[0]
+        dilation2 = self.dilation2[0]
+        x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
+        x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
+
+        return x
+
+    @property
+    def weight(self):
+        return self.weight2
+
+
+def test_dual_conv3d_consistency():
+    # Initialize parameters
+    in_channels = 3
+    out_channels = 5
+    kernel_size = (3, 3, 3)
+    stride = (2, 2, 2)
+    padding = (1, 1, 1)
+
+    # Create an instance of the DualConv3d class
+    dual_conv3d = DualConv3d(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+        padding=padding,
+        bias=True,
+    )
+
+    # Example input tensor
+    test_input = torch.randn(1, 3, 10, 10, 10)
+
+    # Perform forward passes with both 3D and 2D settings
+    output_conv3d = dual_conv3d(test_input, use_conv3d=True)
+    output_2d = dual_conv3d(test_input, use_conv3d=False)
+
+    # Assert that the outputs from both methods are sufficiently close
+    assert torch.allclose(
+        output_conv3d, output_2d, atol=1e-6
+    ), "Outputs are not consistent between 3D and 2D convolutions."
--- a/comfy/ldm/lightricks/vae/pixel_norm.py
+++ b/comfy/ldm/lightricks/vae/pixel_norm.py
@@ -0,0 +1,12 @@
+import torch
+from torch import nn
+
+
+class PixelNorm(nn.Module):
+    def __init__(self, dim=1, eps=1e-8):
+        super(PixelNorm, self).__init__()
+        self.dim = dim
+        self.eps = eps
+
+    def forward(self, x):
+        return x / torch.sqrt(torch.mean(x**2, dim=self.dim, keepdim=True) + self.eps)
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -299,7 +299,10 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
                    if len(mask.shape) == 2:
                        s1 += mask[i:end]
                    else:
-                        s1 += mask[:, i:end]
+                        if mask.shape[1] == 1:
+                            s1 += mask
+                        else:
+                            s1 += mask[:, i:end]

                s2 = s1.softmax(dim=-1).to(v.dtype)
                del s1
@@ -358,7 +361,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
            disabled_xformers = True

    if disabled_xformers:
-        return attention_pytorch(q, k, v, heads, mask)
+        return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)

    if skip_reshape:
         q, k, v = map(
@@ -372,10 +375,10 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
        )

    if mask is not None:
-        pad = 8 - q.shape[1] % 8
-        mask_out = torch.empty([q.shape[0], q.shape[1], q.shape[1] + pad], dtype=q.dtype, device=q.device)
-        mask_out[:, :, :mask.shape[-1]] = mask
-        mask = mask_out[:, :, :mask.shape[-1]]
+        pad = 8 - mask.shape[-1] % 8
+        mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+        mask_out[..., :mask.shape[-1]] = mask
+        mask = mask_out[..., :mask.shape[-1]]

    out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)

@@ -393,6 +396,13 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh

    return out

+if model_management.is_nvidia(): #pytorch 2.3 and up seem to have this issue.
+    SDP_BATCH_LIMIT = 2**15
+else:
+    #TODO: other GPUs ?
+    SDP_BATCH_LIMIT = 2**31
+
+
 def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
    if skip_reshape:
        b, _, _, dim_head = q.shape
@@ -404,10 +414,15 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
            (q, k, v),
        )

-    out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
-    out = (
-        out.transpose(1, 2).reshape(b, -1, heads * dim_head)
-    )
+    if SDP_BATCH_LIMIT >= q.shape[0]:
+        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+        out = (
+            out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+        )
+    else:
+        out = torch.empty((q.shape[0], q.shape[2], heads * dim_head), dtype=q.dtype, layout=q.layout, device=q.device)
+        for i in range(0, q.shape[0], SDP_BATCH_LIMIT):
+            out[i : i + SDP_BATCH_LIMIT] = torch.nn.functional.scaled_dot_product_attention(q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], attn_mask=mask, dropout_p=0.0, is_causal=False).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
    return out


--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -1,12 +1,15 @@
 import logging
 import math
-from typing import Dict, Optional
+from typing import Dict, Optional, List

 import numpy as np
 import torch
 import torch.nn as nn
-from .. import attention
+from ..attention import optimized_attention
 from einops import rearrange, repeat
+from .util import timestep_embedding
+import comfy.ops
+import comfy.ldm.common_dit

 def default(x, y):
    if x is not None:
@@ -68,12 +71,14 @@ class PatchEmbed(nn.Module):
            bias: bool = True,
            strict_img_size: bool = True,
            dynamic_img_pad: bool = True,
+            padding_mode='circular',
            dtype=None,
            device=None,
            operations=None,
    ):
        super().__init__()
        self.patch_size = (patch_size, patch_size)
+        self.padding_mode = padding_mode
        if img_size is not None:
            self.img_size = (img_size, img_size)
            self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
@@ -92,7 +97,7 @@ class PatchEmbed(nn.Module):
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
-        B, C, H, W = x.shape
+        # B, C, H, W = x.shape
        # if self.img_size is not None:
        #     if self.strict_img_size:
        #         _assert(H == self.img_size[0], f"Input height ({H}) doesn't match model ({self.img_size[0]}).")
@@ -107,9 +112,7 @@ class PatchEmbed(nn.Module):
        #             f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
        #         )
        if self.dynamic_img_pad:
-            pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
-            pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
-            x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
+            x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode=self.padding_mode)
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
@@ -230,34 +233,8 @@ class TimestepEmbedder(nn.Module):
        )
        self.frequency_embedding_size = frequency_embedding_size

-    @staticmethod
-    def timestep_embedding(t, dim, max_period=10000):
-        """
-        Create sinusoidal timestep embeddings.
-        :param t: a 1-D Tensor of N indices, one per batch element.
-                          These may be fractional.
-        :param dim: the dimension of the output.
-        :param max_period: controls the minimum frequency of the embeddings.
-        :return: an (N, D) Tensor of positional embeddings.
-        """
-        half = dim // 2
-        freqs = torch.exp(
-            -math.log(max_period)
-            * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
-            / half
-        )
-        args = t[:, None].float() * freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        if dim % 2:
-            embedding = torch.cat(
-                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
-            )
-        if torch.is_floating_point(t):
-            embedding = embedding.to(dtype=t.dtype)
-        return embedding
-
    def forward(self, t, dtype, **kwargs):
-        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+        t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype)
        t_emb = self.mlp(t_freq)
        return t_emb

@@ -289,8 +266,6 @@ def split_qkv(qkv, head_dim):
    qkv = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, -1, head_dim).movedim(2, 0)
    return qkv[0], qkv[1], qkv[2]

-def optimized_attention(qkv, num_heads):
-    return attention.optimized_attention(qkv[0], qkv[1], qkv[2], num_heads)

 class SelfAttention(nn.Module):
    ATTENTION_MODES = ("xformers", "torch", "torch-hb", "math", "debug")
@@ -349,9 +324,9 @@ class SelfAttention(nn.Module):
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        qkv = self.pre_attention(x)
+        q, k, v = self.pre_attention(x)
        x = optimized_attention(
-            qkv, num_heads=self.num_heads
+            q, k, v, heads=self.num_heads
        )
        x = self.post_attention(x)
        return x
@@ -378,29 +353,9 @@ class RMSNorm(torch.nn.Module):
        else:
            self.register_parameter("weight", None)

-    def _norm(self, x):
-        """
-        Apply the RMSNorm normalization to the input tensor.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The normalized tensor.
-        """
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
    def forward(self, x):
-        """
-        Forward pass through the RMSNorm layer.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The output tensor after applying RMSNorm.
-        """
-        x = self._norm(x)
-        if self.learnable_scale:
-            return x * self.weight.to(device=x.device, dtype=x.dtype)
-        else:
-            return x
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
+


 class SwiGLUFeedForward(nn.Module):
@@ -460,6 +415,7 @@ class DismantledBlock(nn.Module):
        scale_mod_only: bool = False,
        swiglu: bool = False,
        qk_norm: Optional[str] = None,
+        x_block_self_attn: bool = False,
        dtype=None,
        device=None,
        operations=None,
@@ -483,6 +439,24 @@ class DismantledBlock(nn.Module):
            device=device,
            operations=operations
        )
+        if x_block_self_attn:
+            assert not pre_only
+            assert not scale_mod_only
+            self.x_block_self_attn = True
+            self.attn2 = SelfAttention(
+                dim=hidden_size,
+                num_heads=num_heads,
+                qkv_bias=qkv_bias,
+                attn_mode=attn_mode,
+                pre_only=False,
+                qk_norm=qk_norm,
+                rmsnorm=rmsnorm,
+                dtype=dtype,
+                device=device,
+                operations=operations
+            )
+        else:
+            self.x_block_self_attn = False
        if not pre_only:
            if not rmsnorm:
                self.norm2 = operations.LayerNorm(
@@ -509,7 +483,11 @@ class DismantledBlock(nn.Module):
                    multiple_of=256,
                )
        self.scale_mod_only = scale_mod_only
-        if not scale_mod_only:
+        if x_block_self_attn:
+            assert not pre_only
+            assert not scale_mod_only
+            n_mods = 9
+        elif not scale_mod_only:
            n_mods = 6 if not pre_only else 2
        else:
            n_mods = 4 if not pre_only else 1
@@ -570,14 +548,64 @@ class DismantledBlock(nn.Module):
        )
        return x

+    def pre_attention_x(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        assert self.x_block_self_attn
+        (
+            shift_msa,
+            scale_msa,
+            gate_msa,
+            shift_mlp,
+            scale_mlp,
+            gate_mlp,
+            shift_msa2,
+            scale_msa2,
+            gate_msa2,
+        ) = self.adaLN_modulation(c).chunk(9, dim=1)
+        x_norm = self.norm1(x)
+        qkv = self.attn.pre_attention(modulate(x_norm, shift_msa, scale_msa))
+        qkv2 = self.attn2.pre_attention(modulate(x_norm, shift_msa2, scale_msa2))
+        return qkv, qkv2, (
+            x,
+            gate_msa,
+            shift_mlp,
+            scale_mlp,
+            gate_mlp,
+            gate_msa2,
+        )
+
+    def post_attention_x(self, attn, attn2, x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2):
+        assert not self.pre_only
+        attn1 = self.attn.post_attention(attn)
+        attn2 = self.attn2.post_attention(attn2)
+        out1 = gate_msa.unsqueeze(1) * attn1
+        out2 = gate_msa2.unsqueeze(1) * attn2
+        x = x + out1
+        x = x + out2
+        x = x + gate_mlp.unsqueeze(1) * self.mlp(
+            modulate(self.norm2(x), shift_mlp, scale_mlp)
+        )
+        return x
+
    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        assert not self.pre_only
-        qkv, intermediates = self.pre_attention(x, c)
-        attn = optimized_attention(
-            qkv,
-            num_heads=self.attn.num_heads,
-        )
-        return self.post_attention(attn, *intermediates)
+        if self.x_block_self_attn:
+            qkv, qkv2, intermediates = self.pre_attention_x(x, c)
+            attn, _ = optimized_attention(
+                qkv[0], qkv[1], qkv[2],
+                num_heads=self.attn.num_heads,
+            )
+            attn2, _ = optimized_attention(
+                qkv2[0], qkv2[1], qkv2[2],
+                num_heads=self.attn2.num_heads,
+            )
+            return self.post_attention_x(attn, attn2, *intermediates)
+        else:
+            qkv, intermediates = self.pre_attention(x, c)
+            attn = optimized_attention(
+                qkv[0], qkv[1], qkv[2],
+                heads=self.attn.num_heads,
+            )
+            return self.post_attention(attn, *intermediates)


 def block_mixing(*args, use_checkpoint=True, **kwargs):
@@ -592,7 +620,10 @@ def block_mixing(*args, use_checkpoint=True, **kwargs):
 def _block_mixing(context, x, context_block, x_block, c):
    context_qkv, context_intermediates = context_block.pre_attention(context, c)

-    x_qkv, x_intermediates = x_block.pre_attention(x, c)
+    if x_block.x_block_self_attn:
+        x_qkv, x_qkv2, x_intermediates = x_block.pre_attention_x(x, c)
+    else:
+        x_qkv, x_intermediates = x_block.pre_attention(x, c)

    o = []
    for t in range(3):
@@ -600,8 +631,8 @@ def _block_mixing(context, x, context_block, x_block, c):
    qkv = tuple(o)

    attn = optimized_attention(
-        qkv,
-        num_heads=x_block.attn.num_heads,
+        qkv[0], qkv[1], qkv[2],
+        heads=x_block.attn.num_heads,
    )
    context_attn, x_attn = (
        attn[:, : context_qkv[0].shape[1]],
@@ -613,7 +644,14 @@ def _block_mixing(context, x, context_block, x_block, c):

    else:
        context = None
-    x = x_block.post_attention(x_attn, *x_intermediates)
+    if x_block.x_block_self_attn:
+        attn2 = optimized_attention(
+                x_qkv2[0], x_qkv2[1], x_qkv2[2],
+                heads=x_block.attn2.num_heads,
+            )
+        x = x_block.post_attention_x(x_attn, attn2, *x_intermediates)
+    else:
+        x = x_block.post_attention(x_attn, *x_intermediates)
    return context, x


@@ -628,8 +666,13 @@ class JointBlock(nn.Module):
        super().__init__()
        pre_only = kwargs.pop("pre_only")
        qk_norm = kwargs.pop("qk_norm", None)
+        x_block_self_attn = kwargs.pop("x_block_self_attn", False)
        self.context_block = DismantledBlock(*args, pre_only=pre_only, qk_norm=qk_norm, **kwargs)
-        self.x_block = DismantledBlock(*args, pre_only=False, qk_norm=qk_norm, **kwargs)
+        self.x_block = DismantledBlock(*args,
+                                       pre_only=False,
+                                       qk_norm=qk_norm,
+                                       x_block_self_attn=x_block_self_attn,
+                                       **kwargs)

    def forward(self, *args, **kwargs):
        return block_mixing(
@@ -685,7 +728,7 @@ class SelfAttentionContext(nn.Module):
    def forward(self, x):
        qkv = self.qkv(x)
        q, k, v = split_qkv(qkv, self.dim_head)
-        x = optimized_attention((q.reshape(q.shape[0], q.shape[1], -1), k, v), self.heads)
+        x = optimized_attention(q.reshape(q.shape[0], q.shape[1], -1), k, v, heads=self.heads)
        return self.proj(x)

 class ContextProcessorBlock(nn.Module):
@@ -744,9 +787,12 @@ class MMDiT(nn.Module):
        qk_norm: Optional[str] = None,
        qkv_bias: bool = True,
        context_processor_layers = None,
+        x_block_self_attn: bool = False,
+        x_block_self_attn_layers: Optional[List[int]] = [],
        context_size = 4096,
        num_blocks = None,
        final_layer = True,
+        skip_blocks = False,
        dtype = None, #TODO
        device = None,
        operations = None,
@@ -761,6 +807,7 @@ class MMDiT(nn.Module):
        self.pos_embed_scaling_factor = pos_embed_scaling_factor
        self.pos_embed_offset = pos_embed_offset
        self.pos_embed_max_size = pos_embed_max_size
+        self.x_block_self_attn_layers = x_block_self_attn_layers

        # hidden_size = default(hidden_size, 64 * depth)
        # num_heads = default(num_heads, hidden_size // 64)
@@ -818,26 +865,28 @@ class MMDiT(nn.Module):
            self.pos_embed = None

        self.use_checkpoint = use_checkpoint
-        self.joint_blocks = nn.ModuleList(
-            [
-                JointBlock(
-                    self.hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    attn_mode=attn_mode,
-                    pre_only=(i == num_blocks - 1) and final_layer,
-                    rmsnorm=rmsnorm,
-                    scale_mod_only=scale_mod_only,
-                    swiglu=swiglu,
-                    qk_norm=qk_norm,
-                    dtype=dtype,
-                    device=device,
-                    operations=operations
-                )
-                for i in range(num_blocks)
-            ]
-        )
+        if not skip_blocks:
+            self.joint_blocks = nn.ModuleList(
+                [
+                    JointBlock(
+                        self.hidden_size,
+                        num_heads,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        attn_mode=attn_mode,
+                        pre_only=(i == num_blocks - 1) and final_layer,
+                        rmsnorm=rmsnorm,
+                        scale_mod_only=scale_mod_only,
+                        swiglu=swiglu,
+                        qk_norm=qk_norm,
+                        x_block_self_attn=(i in self.x_block_self_attn_layers) or x_block_self_attn,
+                        dtype=dtype,
+                        device=device,
+                        operations=operations,
+                    )
+                    for i in range(num_blocks)
+                ]
+            )

        if final_layer:
            self.final_layer = FinalLayer(self.hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
@@ -900,7 +949,9 @@ class MMDiT(nn.Module):
        c_mod: torch.Tensor,
        context: Optional[torch.Tensor] = None,
        control = None,
+        transformer_options = {},
    ) -> torch.Tensor:
+        patches_replace = transformer_options.get("patches_replace", {})
        if self.register_length > 0:
            context = torch.cat(
                (
@@ -912,14 +963,25 @@ class MMDiT(nn.Module):

        # context is B, L', D
        # x is B, L, D
+        blocks_replace = patches_replace.get("dit", {})
        blocks = len(self.joint_blocks)
        for i in range(blocks):
-            context, x = self.joint_blocks[i](
-                context,
-                x,
-                c=c_mod,
-                use_checkpoint=self.use_checkpoint,
-            )
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod}, {"original_block": block_wrap})
+                context = out["txt"]
+                x = out["img"]
+            else:
+                context, x = self.joint_blocks[i](
+                    context,
+                    x,
+                    c=c_mod,
+                    use_checkpoint=self.use_checkpoint,
+                )
            if control is not None:
                control_o = control.get("output")
                if i < len(control_o):
@@ -937,6 +999,7 @@ class MMDiT(nn.Module):
        y: Optional[torch.Tensor] = None,
        context: Optional[torch.Tensor] = None,
        control = None,
+        transformer_options = {},
    ) -> torch.Tensor:
        """
        Forward pass of DiT.
@@ -949,7 +1012,7 @@ class MMDiT(nn.Module):
            context = self.context_processor(context)

        hw = x.shape[-2:]
-        x = self.x_embedder(x) + self.cropped_pos_embed(hw, device=x.device).to(dtype=x.dtype, device=x.device)
+        x = self.x_embedder(x) + comfy.ops.cast_to_input(self.cropped_pos_embed(hw, device=x.device), x)
        c = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        if y is not None and self.y_embedder is not None:
            y = self.y_embedder(y)  # (N, D)
@@ -958,7 +1021,7 @@ class MMDiT(nn.Module):
        if context is not None:
            context = self.context_embedder(context)

-        x = self.forward_core_with_concat(x, c, context, control)
+        x = self.forward_core_with_concat(x, c, context, control, transformer_options)

        x = self.unpatchify(x, hw=hw)  # (N, out_channels, H, W)
        return x[:,:,:hw[-2],:hw[-1]]
@@ -972,7 +1035,8 @@ class OpenAISignatureMMDITWrapper(MMDiT):
        context: Optional[torch.Tensor] = None,
        y: Optional[torch.Tensor] = None,
        control = None,
+        transformer_options = {},
        **kwargs,
    ) -> torch.Tensor:
-        return super().forward(x, timesteps, context=context, y=y, control=control)
+        return super().forward(x, timesteps, context=context, y=y, control=control, transformer_options=transformer_options)

--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@@ -809,7 +809,7 @@ class UNetModel(nn.Module):
        self.out = nn.Sequential(
            operations.GroupNorm(32, ch, dtype=self.dtype, device=device),
            nn.SiLU(),
-            zero_module(operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device)),
+            operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
@@ -842,6 +842,11 @@ class UNetModel(nn.Module):
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
        emb = self.time_embed(t_emb)

+        if "emb_patch" in transformer_patches:
+            patch = transformer_patches["emb_patch"]
+            for p in patch:
+                emb = p(emb, self.model_channels, transformer_options)
+
        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
@@ -234,6 +234,8 @@ def efficient_dot_product_attention(
    def get_mask_chunk(chunk_idx: int) -> Tensor:
        if mask is None:
            return None
+        if mask.shape[1] == 1:
+            return mask
        chunk = min(query_chunk_size, q_tokens)
        return mask[:,chunk_idx:chunk_idx + chunk]

--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -1,5 +1,27 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from __future__ import annotations
 import comfy.utils
+import comfy.model_management
+import comfy.model_base
 import logging
+import torch

 LORA_CLIP_MAP = {
    "mlp.fc1": "mlp_fc1",
@@ -27,6 +49,15 @@ def load_lora(lora, to_load):
            dora_scale = lora[dora_scale_name]
            loaded_keys.add(dora_scale_name)

+        reshape_name = "{}.reshape_weight".format(x)
+        reshape = None
+        if reshape_name in lora.keys():
+            try:
+                reshape = lora[reshape_name].tolist()
+                loaded_keys.add(reshape_name)
+            except:
+                pass
+
        regular_lora = "{}.lora_up.weight".format(x)
        diffusers_lora = "{}_lora.up.weight".format(x)
        diffusers2_lora = "{}.lora_B.weight".format(x)
@@ -60,7 +91,7 @@ def load_lora(lora, to_load):
            if mid_name is not None and mid_name in lora.keys():
                mid = lora[mid_name]
                loaded_keys.add(mid_name)
-            patch_dict[to_load[x]] = ("lora", (lora[A_name], lora[B_name], alpha, mid, dora_scale))
+            patch_dict[to_load[x]] = ("lora", (lora[A_name], lora[B_name], alpha, mid, dora_scale, reshape))
            loaded_keys.add(A_name)
            loaded_keys.add(B_name)

@@ -171,6 +202,12 @@ def load_lora(lora, to_load):
            patch_dict["{}.bias".format(to_load[x][:-len(".weight")])] = ("diff", (diff_bias,))
            loaded_keys.add(diff_bias_name)

+        set_weight_name = "{}.set_weight".format(x)
+        set_weight = lora.get(set_weight_name, None)
+        if set_weight is not None:
+            patch_dict[to_load[x]] = ("set", (set_weight,))
+            loaded_keys.add(set_weight_name)
+
    for x in lora.keys():
        if x not in loaded_keys:
            logging.warning("lora key not loaded: {}".format(x))
@@ -179,9 +216,13 @@ def load_lora(lora, to_load):

 def model_lora_keys_clip(model, key_map={}):
    sdk = model.state_dict().keys()
+    for k in sdk:
+        if k.endswith(".weight"):
+            key_map["text_encoders.{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names

    text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
    clip_l_present = False
+    clip_g_present = False
    for b in range(32): #TODO: clean up
        for c in LORA_CLIP_MAP:
            k = "clip_h.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
@@ -205,6 +246,7 @@ def model_lora_keys_clip(model, key_map={}):

            k = "clip_g.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
            if k in sdk:
+                clip_g_present = True
                if clip_l_present:
                    lora_key = "lora_te2_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #SDXL base
                    key_map[lora_key] = k
@@ -218,11 +260,25 @@ def model_lora_keys_clip(model, key_map={}):
                    lora_key = "lora_prior_te_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #cascade lora: TODO put lora key prefix in the model config
                    key_map[lora_key] = k

-    for k in sdk: #OneTrainer SD3 lora
-        if k.startswith("t5xxl.transformer.") and k.endswith(".weight"):
-            l_key = k[len("t5xxl.transformer."):-len(".weight")]
-            lora_key = "lora_te3_{}".format(l_key.replace(".", "_"))
-            key_map[lora_key] = k
+    for k in sdk:
+        if k.endswith(".weight"):
+            if k.startswith("t5xxl.transformer."):#OneTrainer SD3 and Flux lora
+                l_key = k[len("t5xxl.transformer."):-len(".weight")]
+                t5_index = 1
+                if clip_g_present:
+                    t5_index += 1
+                if clip_l_present:
+                    t5_index += 1
+                    if t5_index == 2:
+                        key_map["lora_te{}_{}".format(t5_index, l_key.replace(".", "_"))] = k #OneTrainer Flux
+                        t5_index += 1
+
+                key_map["lora_te{}_{}".format(t5_index, l_key.replace(".", "_"))] = k
+            elif k.startswith("hydit_clip.transformer.bert."): #HunyuanDiT Lora
+                l_key = k[len("hydit_clip.transformer.bert."):-len(".weight")]
+                lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
+                key_map[lora_key] = k
+

    k = "clip_g.transformer.text_projection.weight"
    if k in sdk:
@@ -241,10 +297,14 @@ def model_lora_keys_unet(model, key_map={}):
    sdk = sd.keys()

    for k in sdk:
-        if k.startswith("diffusion_model.") and k.endswith(".weight"):
-            key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
-            key_map["lora_unet_{}".format(key_lora)] = k
-            key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
+        if k.startswith("diffusion_model."):
+            if k.endswith(".weight"):
+                key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
+                key_map["lora_unet_{}".format(key_lora)] = k
+                key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
+                key_map["{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
+            else:
+                key_map["{}".format(k)] = k #generic lora format for not .weight without any weird key names

    diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config)
    for k in diffusers_keys:
@@ -252,6 +312,7 @@ def model_lora_keys_unet(model, key_map={}):
            unet_key = "diffusion_model.{}".format(diffusers_keys[k])
            key_lora = k[:-len(".weight")].replace(".", "_")
            key_map["lora_unet_{}".format(key_lora)] = unet_key
+            key_map["lycoris_{}".format(key_lora)] = unet_key #simpletuner lycoris format

            diffusers_lora_prefix = ["", "unet."]
            for p in diffusers_lora_prefix:
@@ -274,4 +335,286 @@ def model_lora_keys_unet(model, key_map={}):
                key_lora = "lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_")) #OneTrainer lora
                key_map[key_lora] = to

+                key_lora = "lycoris_{}".format(k[:-len(".weight")].replace(".", "_")) #simpletuner lycoris format
+                key_map[key_lora] = to
+
+
+    if isinstance(model, comfy.model_base.AuraFlow): #Diffusers lora AuraFlow
+        diffusers_keys = comfy.utils.auraflow_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
+        for k in diffusers_keys:
+            if k.endswith(".weight"):
+                to = diffusers_keys[k]
+                key_lora = "transformer.{}".format(k[:-len(".weight")]) #simpletrainer and probably regular diffusers lora format
+                key_map[key_lora] = to
+
+    if isinstance(model, comfy.model_base.HunyuanDiT):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"):
+                key_lora = k[len("diffusion_model."):-len(".weight")]
+                key_map["base_model.model.{}".format(key_lora)] = k #official hunyuan lora format
+
+    if isinstance(model, comfy.model_base.Flux): #Diffusers lora Flux
+        diffusers_keys = comfy.utils.flux_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
+        for k in diffusers_keys:
+            if k.endswith(".weight"):
+                to = diffusers_keys[k]
+                key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format
+                key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
+                key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer
+
    return key_map
+
+
+def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
+    dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
+    lora_diff *= alpha
+    weight_calc = weight + function(lora_diff).type(weight.dtype)
+    weight_norm = (
+        weight_calc.transpose(0, 1)
+        .reshape(weight_calc.shape[1], -1)
+        .norm(dim=1, keepdim=True)
+        .reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
+        .transpose(0, 1)
+    )
+
+    weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
+    if strength != 1.0:
+        weight_calc -= weight
+        weight += strength * (weight_calc)
+    else:
+        weight[:] = weight_calc
+    return weight
+
+def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Tensor:
+    """
+    Pad a tensor to a new shape with zeros.
+
+    Args:
+        tensor (torch.Tensor): The original tensor to be padded.
+        new_shape (List[int]): The desired shape of the padded tensor.
+
+    Returns:
+        torch.Tensor: A new tensor padded with zeros to the specified shape.
+
+    Note:
+        If the new shape is smaller than the original tensor in any dimension,
+        the original tensor will be truncated in that dimension.
+    """
+    if any([new_shape[i] < tensor.shape[i] for i in range(len(new_shape))]):
+        raise ValueError("The new shape must be larger than the original tensor in all dimensions")
+
+    if len(new_shape) != len(tensor.shape):
+        raise ValueError("The new shape must have the same number of dimensions as the original tensor")
+
+    # Create a new tensor filled with zeros
+    padded_tensor = torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device)
+
+    # Create slicing tuples for both tensors
+    orig_slices = tuple(slice(0, dim) for dim in tensor.shape)
+    new_slices = tuple(slice(0, dim) for dim in tensor.shape)
+
+    # Copy the original tensor into the new tensor
+    padded_tensor[new_slices] = tensor[orig_slices]
+
+    return padded_tensor
+
+def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32):
+    for p in patches:
+        strength = p[0]
+        v = p[1]
+        strength_model = p[2]
+        offset = p[3]
+        function = p[4]
+        if function is None:
+            function = lambda a: a
+
+        old_weight = None
+        if offset is not None:
+            old_weight = weight
+            weight = weight.narrow(offset[0], offset[1], offset[2])
+
+        if strength_model != 1.0:
+            weight *= strength_model
+
+        if isinstance(v, list):
+            v = (calculate_weight(v[1:], v[0][1](comfy.model_management.cast_to_device(v[0][0], weight.device, intermediate_dtype, copy=True), inplace=True), key, intermediate_dtype=intermediate_dtype), )
+
+        if len(v) == 1:
+            patch_type = "diff"
+        elif len(v) == 2:
+            patch_type = v[0]
+            v = v[1]
+
+        if patch_type == "diff":
+            diff: torch.Tensor = v[0]
+            # An extra flag to pad the weight if the diff's shape is larger than the weight
+            do_pad_weight = len(v) > 1 and v[1]['pad_weight']
+            if do_pad_weight and diff.shape != weight.shape:
+                logging.info("Pad weight {} from {} to shape: {}".format(key, weight.shape, diff.shape))
+                weight = pad_tensor_to_shape(weight, diff.shape)
+
+            if strength != 0.0:
+                if diff.shape != weight.shape:
+                    logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, diff.shape, weight.shape))
+                else:
+                    weight += function(strength * comfy.model_management.cast_to_device(diff, weight.device, weight.dtype))
+        elif patch_type == "set":
+            weight.copy_(v[0])
+        elif patch_type == "lora": #lora/locon
+            mat1 = comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype)
+            mat2 = comfy.model_management.cast_to_device(v[1], weight.device, intermediate_dtype)
+            dora_scale = v[4]
+            reshape = v[5]
+
+            if reshape is not None:
+                weight = pad_tensor_to_shape(weight, reshape)
+
+            if v[2] is not None:
+                alpha = v[2] / mat2.shape[0]
+            else:
+                alpha = 1.0
+
+            if v[3] is not None:
+                #locon mid weights, hopefully the math is fine because I didn't properly test it
+                mat3 = comfy.model_management.cast_to_device(v[3], weight.device, intermediate_dtype)
+                final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
+                mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
+            try:
+                lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "lokr":
+            w1 = v[0]
+            w2 = v[1]
+            w1_a = v[3]
+            w1_b = v[4]
+            w2_a = v[5]
+            w2_b = v[6]
+            t2 = v[7]
+            dora_scale = v[8]
+            dim = None
+
+            if w1 is None:
+                dim = w1_b.shape[0]
+                w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
+            else:
+                w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
+
+            if w2 is None:
+                dim = w2_b.shape[0]
+                if t2 is None:
+                    w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
+                else:
+                    w2 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                        comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
+                                        comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
+                                        comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
+            else:
+                w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
+
+            if len(w2.shape) == 4:
+                w1 = w1.unsqueeze(2).unsqueeze(2)
+            if v[2] is not None and dim is not None:
+                alpha = v[2] / dim
+            else:
+                alpha = 1.0
+
+            try:
+                lora_diff = torch.kron(w1, w2).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "loha":
+            w1a = v[0]
+            w1b = v[1]
+            if v[2] is not None:
+                alpha = v[2] / w1b.shape[0]
+            else:
+                alpha = 1.0
+
+            w2a = v[3]
+            w2b = v[4]
+            dora_scale = v[7]
+            if v[5] is not None: #cp decomposition
+                t1 = v[5]
+                t2 = v[6]
+                m1 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                    comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
+
+                m2 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                    comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
+            else:
+                m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
+                m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
+
+            try:
+                lora_diff = (m1 * m2).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "glora":
+            dora_scale = v[5]
+
+            old_glora = False
+            if v[3].shape[1] == v[2].shape[0] == v[0].shape[0] == v[1].shape[1]:
+                rank = v[0].shape[0]
+                old_glora = True
+
+            if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]:
+                if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]:
+                    pass
+                else:
+                    old_glora = False
+                    rank = v[1].shape[0]
+
+            a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
+            a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
+            b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
+            b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
+
+            if v[4] is not None:
+                alpha = v[4] / rank
+            else:
+                alpha = 1.0
+
+            try:
+                if old_glora:
+                    lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora
+                else:
+                    if weight.dim() > 2:
+                        lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    else:
+                        lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    lora_diff += torch.mm(b1, b2).reshape(weight.shape)
+
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        else:
+            logging.warning("patch type not recognized {} {}".format(patch_type, key))
+
+        if old_weight is not None:
+            weight = old_weight
+
+    return weight
--- a/comfy/lora_convert.py
+++ b/comfy/lora_convert.py
@@ -0,0 +1,17 @@
+import torch
+
+
+def convert_lora_bfl_control(sd): #BFL loras for Flux
+    sd_out = {}
+    for k in sd:
+        k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.scale.set_weight"))
+        sd_out[k_to] = sd[k]
+
+    sd_out["diffusion_model.img_in.reshape_weight"] = torch.tensor([sd["img_in.lora_B.weight"].shape[0], sd["img_in.lora_A.weight"].shape[1]])
+    return sd_out
+
+
+def convert_lora(sd):
+    if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
+        return convert_lora_bfl_control(sd)
+    return sd
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1,3 +1,21 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
 import torch
 import logging
 from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
@@ -6,9 +24,14 @@ from comfy.ldm.cascade.stage_b import StageB
 from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
 from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
 from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
+import comfy.ldm.genmo.joint_model.asymm_models_joint
 import comfy.ldm.aura.mmdit
+import comfy.ldm.hydit.models
 import comfy.ldm.audio.dit
 import comfy.ldm.audio.embedders
+import comfy.ldm.flux.model
+import comfy.ldm.lightricks.model
+
 import comfy.model_management
 import comfy.conds
 import comfy.ops
@@ -25,6 +48,7 @@ class ModelType(Enum):
    EDM = 5
    FLOW = 6
    V_PREDICTION_CONTINUOUS = 7
+    FLUX = 8


 from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@@ -52,6 +76,9 @@ def model_sampling(model_config, model_type):
    elif model_type == ModelType.V_PREDICTION_CONTINUOUS:
        c = V_PREDICTION
        s = ModelSamplingContinuousV
+    elif model_type == ModelType.FLUX:
+        c = comfy.model_sampling.CONST
+        s = comfy.model_sampling.ModelSamplingFlux

    class ModelSampling(s, c):
        pass
@@ -67,16 +94,19 @@ class BaseModel(torch.nn.Module):
        self.latent_format = model_config.latent_format
        self.model_config = model_config
        self.manual_cast_dtype = model_config.manual_cast_dtype
+        self.device = device

        if not unet_config.get("disable_unet_model_creation", False):
-            if self.manual_cast_dtype is not None:
-                operations = comfy.ops.manual_cast
+            if model_config.custom_operations is None:
+                fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
+                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
            else:
-                operations = comfy.ops.disable_weight_init
+                operations = model_config.custom_operations
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
            if comfy.model_management.force_channels_last():
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
+            logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

@@ -87,6 +117,7 @@ class BaseModel(torch.nn.Module):
        self.concat_keys = ()
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))
+        self.memory_usage_factor = model_config.memory_usage_factor

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        sigma = t
@@ -123,8 +154,7 @@ class BaseModel(torch.nn.Module):
    def encode_adm(self, **kwargs):
        return None

-    def extra_conds(self, **kwargs):
-        out = {}
+    def concat_cond(self, **kwargs):
        if len(self.concat_keys) > 0:
            cond_concat = []
            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
@@ -163,7 +193,14 @@ class BaseModel(torch.nn.Module):
                    elif ck == "masked_image":
                        cond_concat.append(self.blank_inpaint_image_like(noise))
            data = torch.cat(cond_concat, dim=1)
-            out['c_concat'] = comfy.conds.CONDNoiseShape(data)
+            return data
+        return None
+
+    def extra_conds(self, **kwargs):
+        out = {}
+        concat_cond = self.concat_cond(**kwargs)
+        if concat_cond is not None:
+            out['c_concat'] = comfy.conds.CONDNoiseShape(concat_cond)

        adm = self.encode_adm(**kwargs)
        if adm is not None:
@@ -216,6 +253,10 @@ class BaseModel(torch.nn.Module):
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

        unet_state_dict = self.diffusion_model.state_dict()
+
+        if self.model_config.scaled_fp8 is not None:
+            unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)
+
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        if self.model_type == ModelType.V_PREDICTION:
@@ -245,11 +286,11 @@ class BaseModel(torch.nn.Module):
                dtype = self.manual_cast_dtype
            #TODO: this needs to be tweaked
            area = input_shape[0] * math.prod(input_shape[2:])
-            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
+            return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
        else:
            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
            area = input_shape[0] * math.prod(input_shape[2:])
-            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
+            return (area * 0.15 * self.memory_usage_factor) * (1024 * 1024)


 def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
@@ -347,6 +388,7 @@ class SDXL(BaseModel):
        flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)

+
 class SVD_img2vid(BaseModel):
    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
@@ -488,9 +530,7 @@ class SD_X4Upscaler(BaseModel):
        return out

 class IP2P:
-    def extra_conds(self, **kwargs):
-        out = {}
-
+    def concat_cond(self, **kwargs):
        image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]
@@ -502,18 +542,15 @@ class IP2P:
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        image = utils.resize_to_batch_size(image, noise.shape[0])
+        return self.process_ip2p_image_in(image)

-        out['c_concat'] = comfy.conds.CONDNoiseShape(self.process_ip2p_image_in(image))
-        adm = self.encode_adm(**kwargs)
-        if adm is not None:
-            out['y'] = comfy.conds.CONDRegular(adm)
-        return out

 class SD15_instructpix2pix(IP2P, BaseModel):
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.process_ip2p_image_in = lambda image: image

+
 class SDXL_instructpix2pix(IP2P, SDXL):
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
@@ -587,17 +624,6 @@ class SD3(BaseModel):
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out

-    def memory_required(self, input_shape):
-        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
-            dtype = self.get_dtype()
-            if self.manual_cast_dtype is not None:
-                dtype = self.manual_cast_dtype
-            #TODO: this probably needs to be tweaked
-            area = input_shape[0] * input_shape[2] * input_shape[3]
-            return (area * comfy.model_management.dtype_size(dtype) * 0.012) * (1024 * 1024)
-        else:
-            area = input_shape[0] * input_shape[2] * input_shape[3]
-            return (area * 0.3) * (1024 * 1024)

 class AuraFlow(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -648,3 +674,117 @@ class StableAudio1(BaseModel):
            for l in s:
                sd["{}{}".format(k, l)] = s[l]
        return sd
+
+class HunyuanDiT(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hydit.models.HunYuanDiT)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['text_embedding_mask'] = comfy.conds.CONDRegular(attention_mask)
+
+        conditioning_mt5xl = kwargs.get("conditioning_mt5xl", None)
+        if conditioning_mt5xl is not None:
+            out['encoder_hidden_states_t5'] = comfy.conds.CONDRegular(conditioning_mt5xl)
+
+        attention_mask_mt5xl = kwargs.get("attention_mask_mt5xl", None)
+        if attention_mask_mt5xl is not None:
+            out['text_embedding_mask_t5'] = comfy.conds.CONDRegular(attention_mask_mt5xl)
+
+        width = kwargs.get("width", 768)
+        height = kwargs.get("height", 768)
+        crop_w = kwargs.get("crop_w", 0)
+        crop_h = kwargs.get("crop_h", 0)
+        target_width = kwargs.get("target_width", width)
+        target_height = kwargs.get("target_height", height)
+
+        out['image_meta_size'] = comfy.conds.CONDRegular(torch.FloatTensor([[height, width, target_height, target_width, 0, 0]]))
+        return out
+
+class Flux(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
+
+    def concat_cond(self, **kwargs):
+        num_channels = self.diffusion_model.img_in.weight.shape[1] // (self.diffusion_model.patch_size * self.diffusion_model.patch_size)
+        out_channels = self.model_config.unet_config["out_channels"]
+
+        if num_channels <= out_channels:
+            return None
+
+        image = kwargs.get("concat_latent_image", None)
+        noise = kwargs.get("noise", None)
+        device = kwargs["device"]
+
+        if image is None:
+            image = torch.zeros_like(noise)
+
+        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+        image = utils.resize_to_batch_size(image, noise.shape[0])
+        image = self.process_latent_in(image)
+        if num_channels <= out_channels * 2:
+            return image
+
+        #inpaint model
+        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+        if mask is None:
+            mask = torch.ones_like(noise)[:, :1]
+
+        mask = torch.mean(mask, dim=1, keepdim=True)
+        print(mask.shape)
+        mask = utils.common_upscale(mask.to(device), noise.shape[-1] * 8, noise.shape[-2] * 8, "bilinear", "center")
+        mask = mask.view(mask.shape[0], mask.shape[2] // 8, 8, mask.shape[3] // 8, 8).permute(0, 2, 4, 1, 3).reshape(mask.shape[0], -1, mask.shape[2] // 8, mask.shape[3] // 8)
+        mask = utils.resize_to_batch_size(mask, noise.shape[0])
+        return torch.cat((image, mask), dim=1)
+
+    def encode_adm(self, **kwargs):
+        return kwargs["pooled_output"]
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 3.5)]))
+        return out
+
+class GenmoMochi(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.genmo.joint_model.asymm_models_joint.AsymmDiTJoint)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+            out['num_tokens'] = comfy.conds.CONDConstant(max(1, torch.sum(attention_mask).item()))
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        return out
+
+class LTXV(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lightricks.model.LTXVModel) #TODO
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        guiding_latent = kwargs.get("guiding_latent", None)
+        if guiding_latent is not None:
+            out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
+
+        out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
+        return out
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -70,6 +70,11 @@ def detect_unet_config(state_dict, key_prefix):
        context_processor = '{}context_processor.layers.0.attn.qkv.weight'.format(key_prefix)
        if context_processor in state_dict_keys:
            unet_config["context_processor_layers"] = count_blocks(state_dict_keys, '{}context_processor.layers.'.format(key_prefix) + '{}.')
+        unet_config["x_block_self_attn_layers"] = []
+        for key in state_dict_keys:
+            if key.startswith('{}joint_blocks.'.format(key_prefix)) and key.endswith('.x_block.attn2.qkv.weight'):
+                layer = key[len('{}joint_blocks.'.format(key_prefix)):-len('.x_block.attn2.qkv.weight')]
+                unet_config["x_block_self_attn_layers"].append(int(layer))
        return unet_config

    if '{}clf.1.weight'.format(key_prefix) in state_dict_keys: #stable cascade
@@ -109,8 +114,80 @@ def detect_unet_config(state_dict, key_prefix):
        unet_config = {}
        unet_config["max_seq"] = state_dict['{}positional_encoding'.format(key_prefix)].shape[1]
        unet_config["cond_seq_dim"] = state_dict['{}cond_seq_linear.weight'.format(key_prefix)].shape[1]
+        double_layers = count_blocks(state_dict_keys, '{}double_layers.'.format(key_prefix) + '{}.')
+        single_layers = count_blocks(state_dict_keys, '{}single_layers.'.format(key_prefix) + '{}.')
+        unet_config["n_double_layers"] = double_layers
+        unet_config["n_layers"] = double_layers + single_layers
        return unet_config

+    if '{}mlp_t5.0.weight'.format(key_prefix) in state_dict_keys: #Hunyuan DiT
+        unet_config = {}
+        unet_config["image_model"] = "hydit"
+        unet_config["depth"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
+        unet_config["hidden_size"] = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[0]
+        if unet_config["hidden_size"] == 1408 and unet_config["depth"] == 40: #DiT-g/2
+            unet_config["mlp_ratio"] = 4.3637
+        if state_dict['{}extra_embedder.0.weight'.format(key_prefix)].shape[1] == 3968:
+            unet_config["size_cond"] = True
+            unet_config["use_style_cond"] = True
+            unet_config["image_model"] = "hydit1"
+        return unet_config
+
+    if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys: #Flux
+        dit_config = {}
+        dit_config["image_model"] = "flux"
+        dit_config["in_channels"] = 16
+        patch_size = 2
+        dit_config["patch_size"] = patch_size
+        in_key = "{}img_in.weight".format(key_prefix)
+        if in_key in state_dict_keys:
+            dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
+        dit_config["out_channels"] = 16
+        dit_config["vec_in_dim"] = 768
+        dit_config["context_in_dim"] = 4096
+        dit_config["hidden_size"] = 3072
+        dit_config["mlp_ratio"] = 4.0
+        dit_config["num_heads"] = 24
+        dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
+        dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
+        dit_config["axes_dim"] = [16, 56, 56]
+        dit_config["theta"] = 10000
+        dit_config["qkv_bias"] = True
+        dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
+        return dit_config
+
+    if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview
+        dit_config = {}
+        dit_config["image_model"] = "mochi_preview"
+        dit_config["depth"] = 48
+        dit_config["patch_size"] = 2
+        dit_config["num_heads"] = 24
+        dit_config["hidden_size_x"] = 3072
+        dit_config["hidden_size_y"] = 1536
+        dit_config["mlp_ratio_x"] = 4.0
+        dit_config["mlp_ratio_y"] = 4.0
+        dit_config["learn_sigma"] = False
+        dit_config["in_channels"] = 12
+        dit_config["qk_norm"] = True
+        dit_config["qkv_bias"] = False
+        dit_config["out_bias"] = True
+        dit_config["attn_drop"] = 0.0
+        dit_config["patch_embed_bias"] = True
+        dit_config["posenc_preserve_area"] = True
+        dit_config["timestep_mlp_bias"] = True
+        dit_config["attend_to_padding"] = False
+        dit_config["timestep_scale"] = 1000.0
+        dit_config["use_t5"] = True
+        dit_config["t5_feat_dim"] = 4096
+        dit_config["t5_token_length"] = 256
+        dit_config["rope_theta"] = 10000.0
+        return dit_config
+
+    if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
+        dit_config = {}
+        dit_config["image_model"] = "ltxv"
+        return dit_config
+
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None

@@ -252,18 +329,34 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
        return None
    model_config = model_config_from_unet_config(unet_config, state_dict)
    if model_config is None and use_base_if_no_match:
-        return comfy.supported_models_base.BASE(unet_config)
-    else:
-        return model_config
+        model_config = comfy.supported_models_base.BASE(unet_config)
+
+    scaled_fp8_key = "{}scaled_fp8".format(unet_key_prefix)
+    if scaled_fp8_key in state_dict:
+        scaled_fp8_weight = state_dict.pop(scaled_fp8_key)
+        model_config.scaled_fp8 = scaled_fp8_weight.dtype
+        if model_config.scaled_fp8 == torch.float32:
+            model_config.scaled_fp8 = torch.float8_e4m3fn
+
+    return model_config

 def unet_prefix_from_state_dict(state_dict):
-    if "model.model.postprocess_conv.weight" in state_dict: #audio models
-        unet_key_prefix = "model.model."
-    elif "model.double_layers.0.attn.w1q.weight" in state_dict: #aura flow
-        unet_key_prefix = "model."
+    candidates = ["model.diffusion_model.", #ldm/sgm models
+                  "model.model.", #audio models
+                  ]
+    counts = {k: 0 for k in candidates}
+    for k in state_dict:
+        for c in candidates:
+            if k.startswith(c):
+                counts[c] += 1
+                break
+
+    top = max(counts, key=counts.get)
+    if counts[top] > 5:
+        return top
    else:
-        unet_key_prefix = "model.diffusion_model."
-    return unet_key_prefix
+        return "model." #aura flow and others
+

 def convert_config(unet_config):
    new_config = unet_config.copy()
@@ -429,9 +522,15 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
            'transformer_depth': [0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': False,
            'context_dim': 768, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 1, 1, 1, 1],
            'use_temporal_attention': False, 'use_temporal_resblock': False}
+    
+    SD15_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
+            'dtype': dtype, 'in_channels': 9, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
+            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
+            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
+            'use_temporal_attention': False, 'use_temporal_resblock': False}  


-    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p]
+    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]

    for unet_config in supported_models:
        matches = True
@@ -450,37 +549,55 @@ def model_config_from_diffusers_unet(state_dict):
    return None

 def convert_diffusers_mmdit(state_dict, output_prefix=""):
-    num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
-    if num_blocks > 0:
+    out_sd = {}
+
+    if 'joint_transformer_blocks.0.attn.add_k_proj.weight' in state_dict: #AuraFlow
+        num_joint = count_blocks(state_dict, 'joint_transformer_blocks.{}.')
+        num_single = count_blocks(state_dict, 'single_transformer_blocks.{}.')
+        sd_map = comfy.utils.auraflow_to_diffusers({"n_double_layers": num_joint, "n_layers": num_joint + num_single}, output_prefix=output_prefix)
+    elif 'x_embedder.weight' in state_dict: #Flux
+        depth = count_blocks(state_dict, 'transformer_blocks.{}.')
+        depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.')
+        hidden_size = state_dict["x_embedder.bias"].shape[0]
+        sd_map = comfy.utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix)
+    elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3
+        num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
        depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
-        out_sd = {}
        sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
-        for k in sd_map:
-            weight = state_dict.get(k, None)
-            if weight is not None:
-                t = sd_map[k]
+    else:
+        return None

-                if not isinstance(t, str):
-                    if len(t) > 2:
-                        fun = t[2]
-                    else:
-                        fun = lambda a: a
-                    offset = t[1]
-                    if offset is not None:
-                        old_weight = out_sd.get(t[0], None)
-                        if old_weight is None:
-                            old_weight = torch.empty_like(weight)
-                            old_weight = old_weight.repeat([3] + [1] * (len(old_weight.shape) - 1))
+    for k in sd_map:
+        weight = state_dict.get(k, None)
+        if weight is not None:
+            t = sd_map[k]

-                        w = old_weight.narrow(offset[0], offset[1], offset[2])
-                    else:
-                        old_weight = weight
-                        w = weight
-                    w[:] = fun(weight)
-                    t = t[0]
-                    out_sd[t] = old_weight
+            if not isinstance(t, str):
+                if len(t) > 2:
+                    fun = t[2]
                else:
-                    out_sd[t] = weight
-                state_dict.pop(k)
+                    fun = lambda a: a
+                offset = t[1]
+                if offset is not None:
+                    old_weight = out_sd.get(t[0], None)
+                    if old_weight is None:
+                        old_weight = torch.empty_like(weight)
+                    if old_weight.shape[offset[0]] < offset[1] + offset[2]:
+                        exp = list(weight.shape)
+                        exp[offset[0]] = offset[1] + offset[2]
+                        new = torch.empty(exp, device=weight.device, dtype=weight.dtype)
+                        new[:old_weight.shape[0]] = old_weight
+                        old_weight = new
+
+                    w = old_weight.narrow(offset[0], offset[1], offset[2])
+                else:
+                    old_weight = weight
+                    w = weight
+                w[:] = fun(weight)
+                t = t[0]
+                out_sd[t] = old_weight
+            else:
+                out_sd[t] = weight
+            state_dict.pop(k)

    return out_sd
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1,3 +1,21 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
 import psutil
 import logging
 from enum import Enum
@@ -26,9 +44,15 @@ cpu_state = CPUState.GPU

 total_vram = 0

-lowvram_available = True
 xpu_available = False
+torch_version = ""
+try:
+    torch_version = torch.version.__version__
+    xpu_available = (int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) <= 4)) and torch.xpu.is_available()
+except:
+    pass

+lowvram_available = True
 if args.deterministic:
    logging.info("Using deterministic algorithms for pytorch")
    torch.use_deterministic_algorithms(True, warn_only=True)
@@ -48,10 +72,10 @@ if args.directml is not None:

 try:
    import intel_extension_for_pytorch as ipex
-    if torch.xpu.is_available():
-        xpu_available = True
+    _ = torch.xpu.device_count()
+    xpu_available = torch.xpu.is_available()
 except:
-    pass
+    xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available())

 try:
    if torch.backends.mps.is_available():
@@ -121,7 +145,7 @@ total_ram = psutil.virtual_memory().total / (1024 * 1024)
 logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))

 try:
-    logging.info("pytorch version: {}".format(torch.version.__version__))
+    logging.info("pytorch version: {}".format(torch_version))
 except:
    pass

@@ -171,7 +195,6 @@ VAE_DTYPES = [torch.float32]

 try:
    if is_nvidia():
-        torch_version = torch.version.__version__
        if int(torch_version[0]) >= 2:
            if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
                ENABLE_PYTORCH_ATTENTION = True
@@ -273,9 +296,12 @@ class LoadedModel:
    def model_memory(self):
        return self.model.model_size()

+    def model_offloaded_memory(self):
+        return self.model.model_size() - self.model.loaded_size()
+
    def model_memory_required(self, device):
-        if device == self.model.current_device:
-            return 0
+        if device == self.model.current_loaded_device():
+            return self.model_offloaded_memory()
        else:
            return self.model_memory()

@@ -287,38 +313,78 @@ class LoadedModel:

        load_weights = not self.weights_loaded

-        try:
-            if lowvram_model_memory > 0 and load_weights:
-                self.real_model = self.model.patch_model_lowvram(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights)
-            else:
-                self.real_model = self.model.patch_model(device_to=patch_model_to, patch_weights=load_weights)
-        except Exception as e:
-            self.model.unpatch_model(self.model.offload_device)
-            self.model_unload()
-            raise e
+        if self.model.loaded_size() > 0:
+            use_more_vram = lowvram_model_memory
+            if use_more_vram == 0:
+                use_more_vram = 1e32
+            self.model_use_more_vram(use_more_vram)
+        else:
+            try:
+                self.real_model = self.model.patch_model(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, load_weights=load_weights, force_patch_weights=force_patch_weights)
+            except Exception as e:
+                self.model.unpatch_model(self.model.offload_device)
+                self.model_unload()
+                raise e

-        if is_intel_xpu() and not args.disable_ipex_optimize:
-            self.real_model = ipex.optimize(self.real_model.eval(), graph_mode=True, concat_linear=True)
+        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and self.real_model is not None:
+            with torch.no_grad():
+                self.real_model = ipex.optimize(self.real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)

        self.weights_loaded = True
        return self.real_model

    def should_reload_model(self, force_patch_weights=False):
-        if force_patch_weights and self.model.lowvram_patch_counter > 0:
+        if force_patch_weights and self.model.lowvram_patch_counter() > 0:
            return True
        return False

-    def model_unload(self, unpatch_weights=True):
+    def model_unload(self, memory_to_free=None, unpatch_weights=True):
+        if memory_to_free is not None:
+            if memory_to_free < self.model.loaded_size():
+                freed = self.model.partially_unload(self.model.offload_device, memory_to_free)
+                if freed >= memory_to_free:
+                    return False
        self.model.unpatch_model(self.model.offload_device, unpatch_weights=unpatch_weights)
        self.model.model_patches_to(self.model.offload_device)
        self.weights_loaded = self.weights_loaded and not unpatch_weights
        self.real_model = None
+        return True
+
+    def model_use_more_vram(self, extra_memory):
+        return self.model.partially_load(self.device, extra_memory)

    def __eq__(self, other):
        return self.model is other.model

+def use_more_memory(extra_memory, loaded_models, device):
+    for m in loaded_models:
+        if m.device == device:
+            extra_memory -= m.model_use_more_vram(extra_memory)
+            if extra_memory <= 0:
+                break
+
+def offloaded_memory(loaded_models, device):
+    offloaded_mem = 0
+    for m in loaded_models:
+        if m.device == device:
+            offloaded_mem += m.model_offloaded_memory()
+    return offloaded_mem
+
+WINDOWS = any(platform.win32_ver())
+
+EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
+if WINDOWS:
+    EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
+
+if args.reserve_vram is not None:
+    EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
+    logging.debug("Reserving {}MB vram for other applications.".format(EXTRA_RESERVED_VRAM / (1024 * 1024)))
+
+def extra_reserved_memory():
+    return EXTRA_RESERVED_VRAM
+
 def minimum_inference_memory():
-    return (1024 * 1024 * 1024)
+    return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()

 def unload_model_clones(model, unload_weights_only=True, force_unload=True):
    to_unload = []
@@ -342,6 +408,8 @@ def unload_model_clones(model, unload_weights_only=True, force_unload=True):
    if not force_unload:
        if unload_weights_only and unload_weight == False:
            return None
+    else:
+        unload_weight = True

    for i in to_unload:
        logging.debug("unload clone {} {}".format(i, unload_weight))
@@ -352,24 +420,29 @@ def unload_model_clones(model, unload_weights_only=True, force_unload=True):
 def free_memory(memory_required, device, keep_loaded=[]):
    unloaded_model = []
    can_unload = []
+    unloaded_models = []

    for i in range(len(current_loaded_models) -1, -1, -1):
        shift_model = current_loaded_models[i]
        if shift_model.device == device:
            if shift_model not in keep_loaded:
-                can_unload.append((sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
+                can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
                shift_model.currently_used = False

    for x in sorted(can_unload):
        i = x[-1]
+        memory_to_free = None
        if not DISABLE_SMART_MEMORY:
-            if get_free_memory(device) > memory_required:
+            free_mem = get_free_memory(device)
+            if free_mem > memory_required:
                break
-        current_loaded_models[i].model_unload()
-        unloaded_model.append(i)
+            memory_to_free = memory_required - free_mem
+        logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
+        if current_loaded_models[i].model_unload(memory_to_free):
+            unloaded_model.append(i)

    for i in sorted(unloaded_model, reverse=True):
-        current_loaded_models.pop(i)
+        unloaded_models.append(current_loaded_models.pop(i))

    if len(unloaded_model) > 0:
        soft_empty_cache()
@@ -378,12 +451,17 @@ def free_memory(memory_required, device, keep_loaded=[]):
            mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
            if mem_free_torch > mem_free_total * 0.25:
                soft_empty_cache()
+    return unloaded_models

-def load_models_gpu(models, memory_required=0, force_patch_weights=False):
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
    global vram_state

    inference_memory = minimum_inference_memory()
-    extra_mem = max(inference_memory, memory_required)
+    extra_mem = max(inference_memory, memory_required + extra_reserved_memory())
+    if minimum_memory_required is None:
+        minimum_memory_required = extra_mem
+    else:
+        minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())

    models = set(models)

@@ -416,25 +494,36 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):
        devs = set(map(lambda a: a.device, models_already_loaded))
        for d in devs:
            if d != torch.device("cpu"):
-                free_memory(extra_mem, d, models_already_loaded)
-        return
+                free_memory(extra_mem + offloaded_memory(models_already_loaded, d), d, models_already_loaded)
+                free_mem = get_free_memory(d)
+                if free_mem < minimum_memory_required:
+                    logging.info("Unloading models for lowram load.") #TODO: partial model unloading when this case happens, also handle the opposite case where models can be unlowvramed.
+                    models_to_load = free_memory(minimum_memory_required, d)
+                    logging.info("{} models unloaded.".format(len(models_to_load)))
+                else:
+                    use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
+        if len(models_to_load) == 0:
+            return

    logging.info(f"Loading {len(models_to_load)} new model{'s' if len(models_to_load) > 1 else ''}")

    total_memory_required = {}
    for loaded_model in models_to_load:
-        if unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) == True:#unload clones where the weights are different
-            total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
+        unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) #unload clones where the weights are different
+        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)

-    for device in total_memory_required:
-        if device != torch.device("cpu"):
-            free_memory(total_memory_required[device] * 1.3 + extra_mem, device, models_already_loaded)
+    for loaded_model in models_already_loaded:
+        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)

    for loaded_model in models_to_load:
        weights_unloaded = unload_model_clones(loaded_model.model, unload_weights_only=False, force_unload=False) #unload the rest of the clones where the weights can stay loaded
        if weights_unloaded is not None:
            loaded_model.weights_loaded = not weights_unloaded

+    for device in total_memory_required:
+        if device != torch.device("cpu"):
+            free_memory(total_memory_required[device] * 1.1 + extra_mem, device, models_already_loaded)
+
    for loaded_model in models_to_load:
        model = loaded_model.model
        torch_dev = model.load_device
@@ -443,11 +532,11 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):
        else:
            vram_set_state = vram_state
        lowvram_model_memory = 0
-        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
+        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM) and not force_full_load:
            model_size = loaded_model.model_memory_required(torch_dev)
            current_free_mem = get_free_memory(torch_dev)
-            lowvram_model_memory = int(max(64 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3 ))
-            if model_size <= (current_free_mem - inference_memory): #only switch to lowvram if really necessary
+            lowvram_model_memory = max(64 * (1024 * 1024), (current_free_mem - minimum_memory_required), min(current_free_mem * 0.4, current_free_mem - minimum_inference_memory()))
+            if model_size <= lowvram_model_memory: #only switch to lowvram if really necessary
                lowvram_model_memory = 0

        if vram_set_state == VRAMState.NO_VRAM:
@@ -455,6 +544,14 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):

        cur_loaded_model = loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights)
        current_loaded_models.insert(0, loaded_model)
+
+
+    devs = set(map(lambda a: a.device, models_already_loaded))
+    for d in devs:
+        if d != torch.device("cpu"):
+            free_mem = get_free_memory(d)
+            if free_mem > minimum_memory_required:
+                use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
    return


@@ -474,7 +571,9 @@ def loaded_models(only_currently_used=False):
 def cleanup_models(keep_clone_weights_loaded=False):
    to_delete = []
    for i in range(len(current_loaded_models)):
-        if sys.getrefcount(current_loaded_models[i].model) <= 2:
+        #TODO: very fragile function needs improvement
+        num_refs = sys.getrefcount(current_loaded_models[i].model)
+        if num_refs <= 2:
            if not keep_clone_weights_loaded:
                to_delete = [i] + to_delete
            #TODO: find a less fragile way to do this.
@@ -523,7 +622,12 @@ def unet_inital_load_device(parameters, dtype):
    else:
        return cpu_dev

+def maximum_vram_for_weights(device=None):
+    return (get_total_memory(device) * 0.88 - minimum_inference_memory())
+
 def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
+    if model_params < 0:
+        model_params = 1000000000000000000000
    if args.bf16_unet:
        return torch.bfloat16
    if args.fp16_unet:
@@ -532,12 +636,40 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
        return torch.float8_e4m3fn
    if args.fp8_e5m2_unet:
        return torch.float8_e5m2
-    if should_use_fp16(device=device, model_params=model_params, manual_cast=True):
-        if torch.float16 in supported_dtypes:
-            return torch.float16
-    if should_use_bf16(device, model_params=model_params, manual_cast=True):
-        if torch.bfloat16 in supported_dtypes:
-            return torch.bfloat16
+
+    fp8_dtype = None
+    try:
+        for dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+            if dtype in supported_dtypes:
+                fp8_dtype = dtype
+                break
+    except:
+        pass
+
+    if fp8_dtype is not None:
+        if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
+            return fp8_dtype
+
+        free_model_memory = maximum_vram_for_weights(device)
+        if model_params * 2 > free_model_memory:
+            return fp8_dtype
+
+    for dt in supported_dtypes:
+        if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params):
+            if torch.float16 in supported_dtypes:
+                return torch.float16
+        if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params):
+            if torch.bfloat16 in supported_dtypes:
+                return torch.bfloat16
+
+    for dt in supported_dtypes:
+        if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params, manual_cast=True):
+            if torch.float16 in supported_dtypes:
+                return torch.float16
+        if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params, manual_cast=True):
+            if torch.bfloat16 in supported_dtypes:
+                return torch.bfloat16
+
    return torch.float32

 # None means no manual cast
@@ -553,13 +685,14 @@ def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.flo
    if bf16_supported and weight_dtype == torch.bfloat16:
        return None

-    if fp16_supported and torch.float16 in supported_dtypes:
-        return torch.float16
+    fp16_supported = should_use_fp16(inference_device, prioritize_performance=True)
+    for dt in supported_dtypes:
+        if dt == torch.float16 and fp16_supported:
+            return torch.float16
+        if dt == torch.bfloat16 and bf16_supported:
+            return torch.bfloat16

-    elif bf16_supported and torch.bfloat16 in supported_dtypes:
-        return torch.bfloat16
-    else:
-        return torch.float32
+    return torch.float32

 def text_encoder_offload_device():
    if args.gpu_only:
@@ -578,6 +711,20 @@ def text_encoder_device():
    else:
        return torch.device("cpu")

+def text_encoder_initial_device(load_device, offload_device, model_size=0):
+    if load_device == offload_device or model_size <= 1024 * 1024 * 1024:
+        return offload_device
+
+    if is_device_mps(load_device):
+        return offload_device
+
+    mem_l = get_free_memory(load_device)
+    mem_o = get_free_memory(offload_device)
+    if mem_l > (mem_o * 0.5) and model_size * 1.2 < mem_l:
+        return load_device
+    else:
+        return offload_device
+
 def text_encoder_dtype(device=None):
    if args.fp8_e4m3fn_text_enc:
        return torch.float8_e4m3fn
@@ -649,18 +796,29 @@ def supports_cast(device, dtype): #TODO
        return True
    if dtype == torch.float16:
        return True
-    if is_device_mps(device):
-        return False
    if directml_enabled: #TODO: test this
        return False
    if dtype == torch.bfloat16:
        return True
+    if is_device_mps(device):
+        return False
    if dtype == torch.float8_e4m3fn:
        return True
    if dtype == torch.float8_e5m2:
        return True
    return False

+def pick_weight_dtype(dtype, fallback_dtype, device=None):
+    if dtype is None:
+        dtype = fallback_dtype
+    elif dtype_size(dtype) > dtype_size(fallback_dtype):
+        dtype = fallback_dtype
+
+    if not supports_cast(device, dtype):
+        dtype = fallback_dtype
+
+    return dtype
+
 def device_supports_non_blocking(device):
    if is_device_mps(device):
        return False #pytorch bug? mps doesn't support non blocking
@@ -685,27 +843,21 @@ def force_channels_last():
    #TODO
    return False

+def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
+    if device is None or weight.device == device:
+        if not copy:
+            if dtype is None or weight.dtype == dtype:
+                return weight
+        return weight.to(dtype=dtype, copy=copy)
+
+    r = torch.empty_like(weight, dtype=dtype, device=device)
+    r.copy_(weight, non_blocking=non_blocking)
+    return r
+
 def cast_to_device(tensor, device, dtype, copy=False):
-    device_supports_cast = False
-    if tensor.dtype == torch.float32 or tensor.dtype == torch.float16:
-        device_supports_cast = True
-    elif tensor.dtype == torch.bfloat16:
-        if hasattr(device, 'type') and device.type.startswith("cuda"):
-            device_supports_cast = True
-        elif is_intel_xpu():
-            device_supports_cast = True
+    non_blocking = device_supports_non_blocking(device)
+    return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy)

-    non_blocking = device_should_use_non_blocking(device)
-
-    if device_supports_cast:
-        if copy:
-            if tensor.device == device:
-                return tensor.to(dtype, copy=copy, non_blocking=non_blocking)
-            return tensor.to(device, copy=copy, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
-        else:
-            return tensor.to(device, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
-    else:
-        return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)

 def xformers_enabled():
    global directml_enabled
@@ -743,7 +895,8 @@ def pytorch_attention_flash_attention():
 def force_upcast_attention_dtype():
    upcast = args.force_upcast_attention
    try:
-        if platform.mac_ver()[0] in ['14.5']: #black image bug on OSX Sonoma 14.5
+        macos_version = tuple(int(n) for n in platform.mac_ver()[0].split("."))
+        if (14, 5) <= macos_version <= (15, 2):  # black image bug on recent versions of macOS
            upcast = True
    except:
        pass
@@ -839,24 +992,24 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    if torch.version.hip:
        return True

-    props = torch.cuda.get_device_properties("cuda")
+    props = torch.cuda.get_device_properties(device)
    if props.major >= 8:
        return True

    if props.major < 6:
        return False

-    fp16_works = False
-    #FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
-    #when the model doesn't actually fit on the card
-    #TODO: actually test if GP106 and others have the same type of behavior
+    #FP16 is confirmed working on a 1080 (GP104) and on latest pytorch actually seems faster than fp32
    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
    for x in nvidia_10_series:
        if x in props.name.lower():
-            fp16_works = True
+            if WINDOWS or manual_cast:
+                return True
+            else:
+                return False #weird linux behavior where fp32 is faster

-    if fp16_works or manual_cast:
-        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+    if manual_cast:
+        free_model_memory = maximum_vram_for_weights(device)
        if (not prioritize_performance) or model_params * 4 > free_model_memory:
            return True

@@ -876,9 +1029,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
        if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
            return False

-    if device is not None: #TODO not sure about mps bf16 support
+    if device is not None:
        if is_device_mps(device):
-            return False
+            return True

    if FORCE_FP32:
        return False
@@ -886,15 +1039,15 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
    if directml_enabled:
        return False

-    if cpu_mode() or mps_mode():
+    if mps_mode():
+        return True
+
+    if cpu_mode():
        return False

    if is_intel_xpu():
        return True

-    if device is None:
-        device = torch.device("cuda")
-
    props = torch.cuda.get_device_properties(device)
    if props.major >= 8:
        return True
@@ -902,12 +1055,33 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
    bf16_works = torch.cuda.is_bf16_supported()

    if bf16_works or manual_cast:
-        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+        free_model_memory = maximum_vram_for_weights(device)
        if (not prioritize_performance) or model_params * 4 > free_model_memory:
            return True

    return False

+def supports_fp8_compute(device=None):
+    if not is_nvidia():
+        return False
+
+    props = torch.cuda.get_device_properties(device)
+    if props.major >= 9:
+        return True
+    if props.major < 8:
+        return False
+    if props.minor < 9:
+        return False
+
+    if int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) < 3):
+        return False
+
+    if WINDOWS:
+        if (int(torch_version[0]) == 2 and int(torch_version[2]) < 4):
+            return False
+
+    return True
+
 def soft_empty_cache(force=False):
    global cpu_state
    if cpu_state == CPUState.MPS:
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1,34 +1,47 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
 import torch
 import copy
 import inspect
 import logging
 import uuid
+import collections
+import math

 import comfy.utils
+import comfy.float
 import comfy.model_management
-from comfy.types import UnetWrapperFunction
-
-
-def weight_decompose(dora_scale, weight, lora_diff, alpha, strength):
-    dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, torch.float32)
-    lora_diff *= alpha
-    weight_calc = weight + lora_diff.type(weight.dtype)
-    weight_norm = (
-        weight_calc.transpose(0, 1)
-        .reshape(weight_calc.shape[1], -1)
-        .norm(dim=1, keepdim=True)
-        .reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
-        .transpose(0, 1)
-    )
-
-    weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
-    if strength != 1.0:
-        weight_calc -= weight
-        weight += strength * (weight_calc)
-    else:
-        weight[:] = weight_calc
-    return weight
+import comfy.lora
+from comfy.comfy_types import UnetWrapperFunction

+def string_to_seed(data):
+    crc = 0xFFFFFFFF
+    for byte in data:
+        if isinstance(byte, str):
+            byte = ord(byte)
+        crc ^= byte
+        for _ in range(8):
+            if crc & 1:
+                crc = (crc >> 1) ^ 0xEDB88320
+            else:
+                crc >>= 1
+    return crc ^ 0xFFFFFFFF

 def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
    to = model_options["transformer_options"].copy()
@@ -63,10 +76,59 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
        model_options["disable_cfg1_optimization"] = True
    return model_options

+def wipe_lowvram_weight(m):
+    if hasattr(m, "prev_comfy_cast_weights"):
+        m.comfy_cast_weights = m.prev_comfy_cast_weights
+        del m.prev_comfy_cast_weights
+    m.weight_function = None
+    m.bias_function = None
+
+class LowVramPatch:
+    def __init__(self, key, patches):
+        self.key = key
+        self.patches = patches
+    def __call__(self, weight):
+        intermediate_dtype = weight.dtype
+        if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops
+            intermediate_dtype = torch.float32
+            return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key))
+
+        return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype)
+
+def get_key_weight(model, key):
+    set_func = None
+    convert_func = None
+    op_keys = key.rsplit('.', 1)
+    if len(op_keys) < 2:
+        weight = comfy.utils.get_attr(model, key)
+    else:
+        op = comfy.utils.get_attr(model, op_keys[0])
+        try:
+            set_func = getattr(op, "set_{}".format(op_keys[1]))
+        except AttributeError:
+            pass
+
+        try:
+            convert_func = getattr(op, "convert_{}".format(op_keys[1]))
+        except AttributeError:
+            pass
+
+        weight = getattr(op, op_keys[1])
+        if convert_func is not None:
+            weight = comfy.utils.get_attr(model, key)
+
+    return weight, set_func, convert_func
+
 class ModelPatcher:
-    def __init__(self, model, load_device, offload_device, size=0, current_device=None, weight_inplace_update=False):
+    def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
        self.size = size
        self.model = model
+        if not hasattr(self.model, 'device'):
+            logging.debug("Model doesn't have a device attribute.")
+            self.model.device = offload_device
+        elif self.model.device is None:
+            self.model.device = offload_device
+
        self.patches = {}
        self.backup = {}
        self.object_patches = {}
@@ -75,24 +137,32 @@ class ModelPatcher:
        self.model_size()
        self.load_device = load_device
        self.offload_device = offload_device
-        if current_device is None:
-            self.current_device = self.offload_device
-        else:
-            self.current_device = current_device
-
        self.weight_inplace_update = weight_inplace_update
-        self.model_lowvram = False
-        self.lowvram_patch_counter = 0
        self.patches_uuid = uuid.uuid4()

+        if not hasattr(self.model, 'model_loaded_weight_memory'):
+            self.model.model_loaded_weight_memory = 0
+
+        if not hasattr(self.model, 'lowvram_patch_counter'):
+            self.model.lowvram_patch_counter = 0
+
+        if not hasattr(self.model, 'model_lowvram'):
+            self.model.model_lowvram = False
+
    def model_size(self):
        if self.size > 0:
            return self.size
        self.size = comfy.model_management.module_size(self.model)
        return self.size

+    def loaded_size(self):
+        return self.model.model_loaded_weight_memory
+
+    def lowvram_patch_counter(self):
+        return self.model.lowvram_patch_counter
+
    def clone(self):
-        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, weight_inplace_update=self.weight_inplace_update)
+        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
        n.patches = {}
        for k in self.patches:
            n.patches[k] = self.patches[k][:]
@@ -242,17 +312,23 @@ class ModelPatcher:
        return list(p)

    def get_key_patches(self, filter_prefix=None):
-        comfy.model_management.unload_model_clones(self)
        model_sd = self.model_state_dict()
        p = {}
        for k in model_sd:
            if filter_prefix is not None:
                if not k.startswith(filter_prefix):
                    continue
+            bk = self.backup.get(k, None)
+            weight, set_func, convert_func = get_key_weight(self.model, k)
+            if bk is not None:
+                weight = bk.weight
+            if convert_func is None:
+                convert_func = lambda a, **kwargs: a
+
            if k in self.patches:
-                p[k] = [model_sd[k]] + self.patches[k]
+                p[k] = [(weight, convert_func)] + self.patches[k]
            else:
-                p[k] = (model_sd[k],)
+                p[k] = [(weight, convert_func)]
        return p

    def model_state_dict(self, filter_prefix=None):
@@ -264,67 +340,61 @@ class ModelPatcher:
                    sd.pop(k)
        return sd

-    def patch_weight_to_device(self, key, device_to=None):
+    def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
        if key not in self.patches:
            return

-        weight = comfy.utils.get_attr(self.model, key)
-
-        inplace_update = self.weight_inplace_update
+        weight, set_func, convert_func = get_key_weight(self.model, key)
+        inplace_update = self.weight_inplace_update or inplace_update

        if key not in self.backup:
-            self.backup[key] = weight.to(device=self.offload_device, copy=inplace_update)
+            self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)

        if device_to is not None:
            temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
        else:
            temp_weight = weight.to(torch.float32, copy=True)
-        out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype)
-        if inplace_update:
-            comfy.utils.copy_to_param(self.model, key, out_weight)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+
+        out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
+        if set_func is None:
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            if inplace_update:
+                comfy.utils.copy_to_param(self.model, key, out_weight)
+            else:
+                comfy.utils.set_attr_param(self.model, key, out_weight)
        else:
-            comfy.utils.set_attr_param(self.model, key, out_weight)
-
-    def patch_model(self, device_to=None, patch_weights=True):
-        for k in self.object_patches:
-            old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
-            if k not in self.object_patches_backup:
-                self.object_patches_backup[k] = old
-
-        if patch_weights:
-            model_sd = self.model_state_dict()
-            for key in self.patches:
-                if key not in model_sd:
-                    logging.warning("could not patch. key doesn't exist in model: {}".format(key))
-                    continue
-
-                self.patch_weight_to_device(key, device_to)
-
-            if device_to is not None:
-                self.model.to(device_to)
-                self.current_device = device_to
-
-        return self.model
-
-    def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
-        self.patch_model(device_to, patch_weights=False)
-
-        logging.info("loading in lowvram mode {}".format(lowvram_model_memory/(1024 * 1024)))
-        class LowVramPatch:
-            def __init__(self, key, model_patcher):
-                self.key = key
-                self.model_patcher = model_patcher
-            def __call__(self, weight):
-                return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)
+            set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))

+    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
        mem_counter = 0
        patch_counter = 0
+        lowvram_counter = 0
+        loading = []
        for n, m in self.model.named_modules():
+            params = []
+            for name, param in m.named_parameters(recurse=False):
+                params.append(name)
+            if hasattr(m, "comfy_cast_weights") or len(params) > 0:
+                loading.append((comfy.model_management.module_size(m), n, m, params))
+
+        load_completely = []
+        loading.sort(reverse=True)
+        for x in loading:
+            n = x[1]
+            m = x[2]
+            params = x[3]
+            module_mem = x[0]
+
            lowvram_weight = False
-            if hasattr(m, "comfy_cast_weights"):
-                module_mem = comfy.model_management.module_size(m)
+
+            if not full_load and hasattr(m, "comfy_cast_weights"):
                if mem_counter + module_mem >= lowvram_model_memory:
                    lowvram_weight = True
+                    lowvram_counter += 1
+                    if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
+                        continue

            weight_key = "{}.weight".format(n)
            bias_key = "{}.bias".format(n)
@@ -334,227 +404,172 @@ class ModelPatcher:
                    if force_patch_weights:
                        self.patch_weight_to_device(weight_key)
                    else:
-                        m.weight_function = LowVramPatch(weight_key, self)
+                        m.weight_function = LowVramPatch(weight_key, self.patches)
                        patch_counter += 1
                if bias_key in self.patches:
                    if force_patch_weights:
                        self.patch_weight_to_device(bias_key)
                    else:
-                        m.bias_function = LowVramPatch(bias_key, self)
+                        m.bias_function = LowVramPatch(bias_key, self.patches)
                        patch_counter += 1

                m.prev_comfy_cast_weights = m.comfy_cast_weights
                m.comfy_cast_weights = True
            else:
-                if hasattr(m, "weight"):
-                    self.patch_weight_to_device(weight_key, device_to)
-                    self.patch_weight_to_device(bias_key, device_to)
-                    m.to(device_to)
-                    mem_counter += comfy.model_management.module_size(m)
-                    logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
+                if hasattr(m, "comfy_cast_weights"):
+                    if m.comfy_cast_weights:
+                        wipe_lowvram_weight(m)

-        self.model_lowvram = True
-        self.lowvram_patch_counter = patch_counter
+                mem_counter += module_mem
+                load_completely.append((module_mem, n, m, params))
+
+        load_completely.sort(reverse=True)
+        for x in load_completely:
+            n = x[1]
+            m = x[2]
+            params = x[3]
+            if hasattr(m, "comfy_patched_weights"):
+                if m.comfy_patched_weights == True:
+                    continue
+
+            for param in params:
+                self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to)
+
+            logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
+            m.comfy_patched_weights = True
+
+        for x in load_completely:
+            x[2].to(device_to)
+
+        if lowvram_counter > 0:
+            logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
+            self.model.model_lowvram = True
+        else:
+            logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
+            self.model.model_lowvram = False
+            if full_load:
+                self.model.to(device_to)
+                mem_counter = self.model_size()
+
+        self.model.lowvram_patch_counter += patch_counter
+        self.model.device = device_to
+        self.model.model_loaded_weight_memory = mem_counter
+
+    def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
+        for k in self.object_patches:
+            old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
+            if k not in self.object_patches_backup:
+                self.object_patches_backup[k] = old
+
+        if lowvram_model_memory == 0:
+            full_load = True
+        else:
+            full_load = False
+
+        if load_weights:
+            self.load(device_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights, full_load=full_load)
        return self.model

-    def calculate_weight(self, patches, weight, key):
-        for p in patches:
-            strength = p[0]
-            v = p[1]
-            strength_model = p[2]
-            offset = p[3]
-            function = p[4]
-            if function is None:
-                function = lambda a: a
-
-            old_weight = None
-            if offset is not None:
-                old_weight = weight
-                weight = weight.narrow(offset[0], offset[1], offset[2])
-
-            if strength_model != 1.0:
-                weight *= strength_model
-
-            if isinstance(v, list):
-                v = (self.calculate_weight(v[1:], v[0].clone(), key), )
-
-            if len(v) == 1:
-                patch_type = "diff"
-            elif len(v) == 2:
-                patch_type = v[0]
-                v = v[1]
-
-            if patch_type == "diff":
-                w1 = v[0]
-                if strength != 0.0:
-                    if w1.shape != weight.shape:
-                        logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape))
-                    else:
-                        weight += function(strength * comfy.model_management.cast_to_device(w1, weight.device, weight.dtype))
-            elif patch_type == "lora": #lora/locon
-                mat1 = comfy.model_management.cast_to_device(v[0], weight.device, torch.float32)
-                mat2 = comfy.model_management.cast_to_device(v[1], weight.device, torch.float32)
-                dora_scale = v[4]
-                if v[2] is not None:
-                    alpha = v[2] / mat2.shape[0]
-                else:
-                    alpha = 1.0
-
-                if v[3] is not None:
-                    #locon mid weights, hopefully the math is fine because I didn't properly test it
-                    mat3 = comfy.model_management.cast_to_device(v[3], weight.device, torch.float32)
-                    final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
-                    mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
-                try:
-                    lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
-                    if dora_scale is not None:
-                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
-                    else:
-                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
-                except Exception as e:
-                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
-            elif patch_type == "lokr":
-                w1 = v[0]
-                w2 = v[1]
-                w1_a = v[3]
-                w1_b = v[4]
-                w2_a = v[5]
-                w2_b = v[6]
-                t2 = v[7]
-                dora_scale = v[8]
-                dim = None
-
-                if w1 is None:
-                    dim = w1_b.shape[0]
-                    w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, torch.float32),
-                                  comfy.model_management.cast_to_device(w1_b, weight.device, torch.float32))
-                else:
-                    w1 = comfy.model_management.cast_to_device(w1, weight.device, torch.float32)
-
-                if w2 is None:
-                    dim = w2_b.shape[0]
-                    if t2 is None:
-                        w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32),
-                                      comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32))
-                    else:
-                        w2 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                          comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
-                                          comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32),
-                                          comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32))
-                else:
-                    w2 = comfy.model_management.cast_to_device(w2, weight.device, torch.float32)
-
-                if len(w2.shape) == 4:
-                    w1 = w1.unsqueeze(2).unsqueeze(2)
-                if v[2] is not None and dim is not None:
-                    alpha = v[2] / dim
-                else:
-                    alpha = 1.0
-
-                try:
-                    lora_diff = torch.kron(w1, w2).reshape(weight.shape)
-                    if dora_scale is not None:
-                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
-                    else:
-                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
-                except Exception as e:
-                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
-            elif patch_type == "loha":
-                w1a = v[0]
-                w1b = v[1]
-                if v[2] is not None:
-                    alpha = v[2] / w1b.shape[0]
-                else:
-                    alpha = 1.0
-
-                w2a = v[3]
-                w2b = v[4]
-                dora_scale = v[7]
-                if v[5] is not None: #cp decomposition
-                    t1 = v[5]
-                    t2 = v[6]
-                    m1 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                      comfy.model_management.cast_to_device(t1, weight.device, torch.float32),
-                                      comfy.model_management.cast_to_device(w1b, weight.device, torch.float32),
-                                      comfy.model_management.cast_to_device(w1a, weight.device, torch.float32))
-
-                    m2 = torch.einsum('i j k l, j r, i p -> p r k l',
-                                      comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
-                                      comfy.model_management.cast_to_device(w2b, weight.device, torch.float32),
-                                      comfy.model_management.cast_to_device(w2a, weight.device, torch.float32))
-                else:
-                    m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, torch.float32),
-                                  comfy.model_management.cast_to_device(w1b, weight.device, torch.float32))
-                    m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, torch.float32),
-                                  comfy.model_management.cast_to_device(w2b, weight.device, torch.float32))
-
-                try:
-                    lora_diff = (m1 * m2).reshape(weight.shape)
-                    if dora_scale is not None:
-                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
-                    else:
-                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
-                except Exception as e:
-                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
-            elif patch_type == "glora":
-                if v[4] is not None:
-                    alpha = v[4] / v[0].shape[0]
-                else:
-                    alpha = 1.0
-
-                dora_scale = v[5]
-
-                a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, torch.float32)
-                a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, torch.float32)
-                b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, torch.float32)
-                b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, torch.float32)
-
-                try:
-                    lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1), a2), a1)).reshape(weight.shape)
-                    if dora_scale is not None:
-                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
-                    else:
-                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
-                except Exception as e:
-                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
-            else:
-                logging.warning("patch type not recognized {} {}".format(patch_type, key))
-
-            if old_weight is not None:
-                weight = old_weight
-
-        return weight
-
    def unpatch_model(self, device_to=None, unpatch_weights=True):
        if unpatch_weights:
-            if self.model_lowvram:
+            if self.model.model_lowvram:
                for m in self.model.modules():
-                    if hasattr(m, "prev_comfy_cast_weights"):
-                        m.comfy_cast_weights = m.prev_comfy_cast_weights
-                        del m.prev_comfy_cast_weights
-                    m.weight_function = None
-                    m.bias_function = None
+                    wipe_lowvram_weight(m)

-                self.model_lowvram = False
-                self.lowvram_patch_counter = 0
+                self.model.model_lowvram = False
+                self.model.lowvram_patch_counter = 0

            keys = list(self.backup.keys())

-            if self.weight_inplace_update:
-                for k in keys:
-                    comfy.utils.copy_to_param(self.model, k, self.backup[k])
-            else:
-                for k in keys:
-                    comfy.utils.set_attr_param(self.model, k, self.backup[k])
+            for k in keys:
+                bk = self.backup[k]
+                if bk.inplace_update:
+                    comfy.utils.copy_to_param(self.model, k, bk.weight)
+                else:
+                    comfy.utils.set_attr_param(self.model, k, bk.weight)

            self.backup.clear()

            if device_to is not None:
                self.model.to(device_to)
-                self.current_device = device_to
+                self.model.device = device_to
+            self.model.model_loaded_weight_memory = 0
+
+            for m in self.model.modules():
+                if hasattr(m, "comfy_patched_weights"):
+                    del m.comfy_patched_weights

        keys = list(self.object_patches_backup.keys())
        for k in keys:
            comfy.utils.set_attr(self.model, k, self.object_patches_backup[k])

        self.object_patches_backup.clear()
+
+    def partially_unload(self, device_to, memory_to_free=0):
+        memory_freed = 0
+        patch_counter = 0
+        unload_list = []
+
+        for n, m in self.model.named_modules():
+            shift_lowvram = False
+            if hasattr(m, "comfy_cast_weights"):
+                module_mem = comfy.model_management.module_size(m)
+                unload_list.append((module_mem, n, m))
+
+        unload_list.sort()
+        for unload in unload_list:
+            if memory_to_free < memory_freed:
+                break
+            module_mem = unload[0]
+            n = unload[1]
+            m = unload[2]
+            weight_key = "{}.weight".format(n)
+            bias_key = "{}.bias".format(n)
+
+            if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
+                for key in [weight_key, bias_key]:
+                    bk = self.backup.get(key, None)
+                    if bk is not None:
+                        if bk.inplace_update:
+                            comfy.utils.copy_to_param(self.model, key, bk.weight)
+                        else:
+                            comfy.utils.set_attr_param(self.model, key, bk.weight)
+                        self.backup.pop(key)
+
+                m.to(device_to)
+                if weight_key in self.patches:
+                    m.weight_function = LowVramPatch(weight_key, self.patches)
+                    patch_counter += 1
+                if bias_key in self.patches:
+                    m.bias_function = LowVramPatch(bias_key, self.patches)
+                    patch_counter += 1
+
+                m.prev_comfy_cast_weights = m.comfy_cast_weights
+                m.comfy_cast_weights = True
+                m.comfy_patched_weights = False
+                memory_freed += module_mem
+                logging.debug("freed {}".format(n))
+
+        self.model.model_lowvram = True
+        self.model.lowvram_patch_counter += patch_counter
+        self.model.model_loaded_weight_memory -= memory_freed
+        return memory_freed
+
+    def partially_load(self, device_to, extra_memory=0):
+        self.unpatch_model(unpatch_weights=False)
+        self.patch_model(load_weights=False)
+        full_load = False
+        if self.model.model_lowvram == False:
+            return 0
+        if self.model.model_loaded_weight_memory + extra_memory > self.model_size():
+            full_load = True
+        current_used = self.model.model_loaded_weight_memory
+        self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
+        return self.model.model_loaded_weight_memory - current_used
+
+    def current_loaded_device(self):
+        return self.model.device
+
+    def calculate_weight(self, patches, weight, key, intermediate_dtype=torch.float32):
+        print("WARNING the ModelPatcher.calculate_weight function is deprecated, please use: comfy.lora.calculate_weight instead")
+        return comfy.lora.calculate_weight(patches, weight, key, intermediate_dtype=intermediate_dtype)
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@@ -2,6 +2,25 @@ import torch
 from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
 import math

+def rescale_zero_terminal_snr_sigmas(sigmas):
+    alphas_cumprod = 1 / ((sigmas * sigmas) + 1)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas_bar[-1] = 4.8973451890853435e-08
+    return ((1 - alphas_bar) / alphas_bar) ** 0.5
+
 class EPS:
    def calculate_input(self, sigma, noise):
        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
@@ -48,7 +67,7 @@ class CONST:
        return latent / (1.0 - sigma)

 class ModelSamplingDiscrete(torch.nn.Module):
-    def __init__(self, model_config=None):
+    def __init__(self, model_config=None, zsnr=None):
        super().__init__()

        if model_config is not None:
@@ -59,12 +78,16 @@ class ModelSamplingDiscrete(torch.nn.Module):
        beta_schedule = sampling_settings.get("beta_schedule", "linear")
        linear_start = sampling_settings.get("linear_start", 0.00085)
        linear_end = sampling_settings.get("linear_end", 0.012)
+        timesteps = sampling_settings.get("timesteps", 1000)

-        self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=1000, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3)
+        if zsnr is None:
+            zsnr = sampling_settings.get("zsnr", False)
+
+        self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3, zsnr=zsnr)
        self.sigma_data = 1.0

    def _register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
-                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3, zsnr=False):
        if given_betas is not None:
            betas = given_betas
        else:
@@ -82,6 +105,9 @@ class ModelSamplingDiscrete(torch.nn.Module):
        # self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))

        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
+        if zsnr:
+            sigmas = rescale_zero_terminal_snr_sigmas(sigmas)
+
        self.set_sigmas(sigmas)

    def set_sigmas(self, sigmas):
@@ -271,3 +297,43 @@ class StableCascadeSampling(ModelSamplingDiscrete):

        percent = 1.0 - percent
        return self.sigma(torch.tensor(percent))
+
+
+def flux_time_shift(mu: float, sigma: float, t):
+    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+class ModelSamplingFlux(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+
+        self.set_parameters(shift=sampling_settings.get("shift", 1.15))
+
+    def set_parameters(self, shift=1.15, timesteps=10000):
+        self.shift = shift
+        ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps))
+        self.register_buffer('sigmas', ts)
+
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        return sigma
+
+    def sigma(self, timestep):
+        return flux_time_shift(self.shift, 1.0, timestep)
+
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 1.0
+        if percent >= 1.0:
+            return 0.0
+        return 1.0 - percent
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -18,16 +18,34 @@

 import torch
 import comfy.model_management
+from comfy.cli_args import args
+import comfy.float
+
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+
+def cast_to_input(weight, input, non_blocking=False, copy=True):
+    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
+
+def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
+    if input is not None:
+        if dtype is None:
+            dtype = input.dtype
+        if bias_dtype is None:
+            bias_dtype = dtype
+        if device is None:
+            device = input.device

-def cast_bias_weight(s, input):
    bias = None
-    non_blocking = comfy.model_management.device_should_use_non_blocking(input.device)
+    non_blocking = comfy.model_management.device_supports_non_blocking(device)
    if s.bias is not None:
-        bias = s.bias.to(device=input.device, dtype=input.dtype, non_blocking=non_blocking)
-        if s.bias_function is not None:
+        has_function = s.bias_function is not None
+        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function)
+        if has_function:
            bias = s.bias_function(bias)
-    weight = s.weight.to(device=input.device, dtype=input.dtype, non_blocking=non_blocking)
-    if s.weight_function is not None:
+
+    has_function = s.weight_function is not None
+    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function)
+    if has_function:
        weight = s.weight_function(weight)
    return weight, bias

@@ -168,6 +186,26 @@ class disable_weight_init:
            else:
                return super().forward(*args, **kwargs)

+    class Embedding(torch.nn.Embedding, CastWeightBiasOp):
+        def reset_parameters(self):
+            self.bias = None
+            return None
+
+        def forward_comfy_cast_weights(self, input, out_dtype=None):
+            output_dtype = out_dtype
+            if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
+                out_dtype = None
+            weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
+            return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                if "out_dtype" in kwargs:
+                    kwargs.pop("out_dtype")
+                return super().forward(*args, **kwargs)
+
    @classmethod
    def conv_nd(s, dims, *args, **kwargs):
        if dims == 2:
@@ -202,3 +240,127 @@ class manual_cast(disable_weight_init):

    class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
        comfy_cast_weights = True
+
+    class Embedding(disable_weight_init.Embedding):
+        comfy_cast_weights = True
+
+
+def fp8_linear(self, input):
+    dtype = self.weight.dtype
+    if dtype not in [torch.float8_e4m3fn]:
+        return None
+
+    tensor_2d = False
+    if len(input.shape) == 2:
+        tensor_2d = True
+        input = input.unsqueeze(1)
+
+
+    if len(input.shape) == 3:
+        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input.dtype)
+        w = w.t()
+
+        scale_weight = self.scale_weight
+        scale_input = self.scale_input
+        if scale_weight is None:
+            scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
+        else:
+            scale_weight = scale_weight.to(input.device)
+
+        if scale_input is None:
+            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
+            inn = input.reshape(-1, input.shape[2]).to(dtype)
+        else:
+            scale_input = scale_input.to(input.device)
+            inn = (input * (1.0 / scale_input).to(input.dtype)).reshape(-1, input.shape[2]).to(dtype)
+
+        if bias is not None:
+            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
+        else:
+            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, scale_a=scale_input, scale_b=scale_weight)
+
+        if isinstance(o, tuple):
+            o = o[0]
+
+        if tensor_2d:
+            return o.reshape(input.shape[0], -1)
+
+        return o.reshape((-1, input.shape[1], self.weight.shape[0]))
+
+    return None
+
+class fp8_ops(manual_cast):
+    class Linear(manual_cast.Linear):
+        def reset_parameters(self):
+            self.scale_weight = None
+            self.scale_input = None
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            out = fp8_linear(self, input)
+            if out is not None:
+                return out
+
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.linear(input, weight, bias)
+
+def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
+    class scaled_fp8_op(manual_cast):
+        class Linear(manual_cast.Linear):
+            def __init__(self, *args, **kwargs):
+                if override_dtype is not None:
+                    kwargs['dtype'] = override_dtype
+                super().__init__(*args, **kwargs)
+
+            def reset_parameters(self):
+                if not hasattr(self, 'scale_weight'):
+                    self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
+
+                if not scale_input:
+                    self.scale_input = None
+
+                if not hasattr(self, 'scale_input'):
+                    self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
+                return None
+
+            def forward_comfy_cast_weights(self, input):
+                if fp8_matrix_mult:
+                    out = fp8_linear(self, input)
+                    if out is not None:
+                        return out
+
+                weight, bias = cast_bias_weight(self, input)
+
+                if weight.numel() < input.numel(): #TODO: optimize
+                    return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias)
+                else:
+                    return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias)
+
+            def convert_weight(self, weight, inplace=False, **kwargs):
+                if inplace:
+                    weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+                    return weight
+                else:
+                    return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+
+            def set_weight(self, weight, inplace_update=False, seed=None, **kwargs):
+                weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
+                if inplace_update:
+                    self.weight.data.copy_(weight)
+                else:
+                    self.weight = torch.nn.Parameter(weight, requires_grad=False)
+
+    return scaled_fp8_op
+
+def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
+    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
+    if scaled_fp8 is not None:
+        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)
+
+    if fp8_compute and (fp8_optimizations or args.fast) and not disable_fast_fp8:
+        return fp8_ops
+
+    if compute_dtype is None or weight_dtype == compute_dtype:
+        return disable_weight_init
+
+    return manual_cast
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -1,14 +1,10 @@
 import torch
 import comfy.model_management
 import comfy.conds
+import comfy.utils

 def prepare_mask(noise_mask, shape, device):
-    """ensures noise mask is of proper dimensions"""
-    noise_mask = torch.nn.functional.interpolate(noise_mask.reshape((-1, 1, noise_mask.shape[-2], noise_mask.shape[-1])), size=(shape[2], shape[3]), mode="bilinear")
-    noise_mask = torch.cat([noise_mask] * shape[1], dim=1)
-    noise_mask = comfy.utils.repeat_to_batch_size(noise_mask, shape[0])
-    noise_mask = noise_mask.to(device)
-    return noise_mask
+    return comfy.utils.reshape_mask(noise_mask, shape).to(device)

 def get_models_from_cond(cond, model_type):
    models = []
@@ -61,7 +57,9 @@ def prepare_sampling(model, noise_shape, conds):
    device = model.load_device
    real_model = None
    models, inference_memory = get_additional_models(conds, model.model_dtype())
-    comfy.model_management.load_models_gpu([model] + models, model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory)
+    memory_required = model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory
+    minimum_memory_required = model.memory_required([noise_shape[0]] + list(noise_shape[1:])) + inference_memory
+    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required)
    real_model = model.model

    return real_model, conds, models
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -6,6 +6,8 @@ from comfy import model_management
 import math
 import logging
 import comfy.sampler_helpers
+import scipy.stats
+import numpy

 def get_area_and_mult(conds, x_in, timestep_in):
    dims = tuple(x_in.shape[2:])
@@ -169,7 +171,7 @@ def calc_cond_batch(model, conds, x_in, timestep, model_options):
        for i in range(1, len(to_batch_temp) + 1):
            batch_amount = to_batch_temp[:len(to_batch_temp)//i]
            input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
-            if model.memory_required(input_shape) < free_memory:
+            if model.memory_required(input_shape) * 1.5 < free_memory:
                to_batch = batch_amount
                break

@@ -311,13 +313,18 @@ def simple_scheduler(model_sampling, steps):
 def ddim_scheduler(model_sampling, steps):
    s = model_sampling
    sigs = []
-    ss = max(len(s.sigmas) // steps, 1)
    x = 1
+    if math.isclose(float(s.sigmas[x]), 0, abs_tol=0.00001):
+        steps += 1
+        sigs = []
+    else:
+        sigs = [0.0]
+
+    ss = max(len(s.sigmas) // steps, 1)
    while x < len(s.sigmas):
        sigs += [float(s.sigmas[x])]
        x += ss
    sigs = sigs[::-1]
-    sigs += [0.0]
    return torch.FloatTensor(sigs)

 def normal_scheduler(model_sampling, steps, sgm=False, floor=False):
@@ -325,18 +332,61 @@ def normal_scheduler(model_sampling, steps, sgm=False, floor=False):
    start = s.timestep(s.sigma_max)
    end = s.timestep(s.sigma_min)

+    append_zero = True
    if sgm:
        timesteps = torch.linspace(start, end, steps + 1)[:-1]
    else:
+        if math.isclose(float(s.sigma(end)), 0, abs_tol=0.00001):
+            steps += 1
+            append_zero = False
        timesteps = torch.linspace(start, end, steps)

    sigs = []
    for x in range(len(timesteps)):
        ts = timesteps[x]
-        sigs.append(s.sigma(ts))
+        sigs.append(float(s.sigma(ts)))
+
+    if append_zero:
+        sigs += [0.0]
+
+    return torch.FloatTensor(sigs)
+
+# Implemented based on: https://arxiv.org/abs/2407.12173
+def beta_scheduler(model_sampling, steps, alpha=0.6, beta=0.6):
+    total_timesteps = (len(model_sampling.sigmas) - 1)
+    ts = 1 - numpy.linspace(0, 1, steps, endpoint=False)
+    ts = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps)
+
+    sigs = []
+    last_t = -1
+    for t in ts:
+        if t != last_t:
+            sigs += [float(model_sampling.sigmas[int(t)])]
+        last_t = t
    sigs += [0.0]
    return torch.FloatTensor(sigs)

+# from: https://github.com/genmoai/models/blob/main/src/mochi_preview/infer.py#L41
+def linear_quadratic_schedule(model_sampling, steps, threshold_noise=0.025, linear_steps=None):
+    if steps == 1:
+        sigma_schedule = [1.0, 0.0]
+    else:
+        if linear_steps is None:
+            linear_steps = steps // 2
+        linear_sigma_schedule = [i * threshold_noise / linear_steps for i in range(linear_steps)]
+        threshold_noise_step_diff = linear_steps - threshold_noise * steps
+        quadratic_steps = steps - linear_steps
+        quadratic_coef = threshold_noise_step_diff / (linear_steps * quadratic_steps ** 2)
+        linear_coef = threshold_noise / linear_steps - 2 * threshold_noise_step_diff / (quadratic_steps ** 2)
+        const = quadratic_coef * (linear_steps ** 2)
+        quadratic_sigma_schedule = [
+            quadratic_coef * (i ** 2) + linear_coef * i + const
+            for i in range(linear_steps, steps)
+        ]
+        sigma_schedule = linear_sigma_schedule + quadratic_sigma_schedule + [1.0]
+        sigma_schedule = [1.0 - x for x in sigma_schedule]
+    return torch.FloatTensor(sigma_schedule) * model_sampling.sigma_max.cpu()
+
 def get_mask_aabb(masks):
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.int)
@@ -544,8 +594,8 @@ class Sampler:
        return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma

 KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral",
-                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu",
-                  "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
+                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
+                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
                  "ipndm", "ipndm_v", "deis"]

 class KSAMPLER(Sampler):
@@ -703,7 +753,7 @@ def sample(model, noise, positive, negative, cfg, device, sampler, sigmas, model
    return cfg_guider.sample(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)


-SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform"]
+SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform", "beta", "linear_quadratic"]
 SAMPLER_NAMES = KSAMPLER_NAMES + ["ddim", "uni_pc", "uni_pc_bh2"]

 def calculate_sigmas(model_sampling, scheduler_name, steps):
@@ -719,6 +769,10 @@ def calculate_sigmas(model_sampling, scheduler_name, steps):
        sigmas = ddim_scheduler(model_sampling, steps)
    elif scheduler_name == "sgm_uniform":
        sigmas = normal_scheduler(model_sampling, steps, sgm=True)
+    elif scheduler_name == "beta":
+        sigmas = beta_scheduler(model_sampling, steps)
+    elif scheduler_name == "linear_quadratic":
+        sigmas = linear_quadratic_schedule(model_sampling, steps)
    else:
        logging.error("error invalid scheduler {}".format(scheduler_name))
    return sigmas
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -7,6 +7,8 @@ from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine
 from .ldm.cascade.stage_a import StageA
 from .ldm.cascade.stage_c_coder import StageC_coder
 from .ldm.audio.autoencoder import AudioOobleckVAE
+import comfy.ldm.genmo.vae.model
+import comfy.ldm.lightricks.vae.causal_video_autoencoder
 import yaml

 import comfy.utils
@@ -17,18 +19,25 @@ from . import diffusers_convert
 from . import model_detection

 from . import sd1_clip
-from . import sd2_clip
 from . import sdxl_clip
-from . import sd3_clip
-from . import sa_t5
+import comfy.text_encoders.sd2_clip
+import comfy.text_encoders.sd3_clip
+import comfy.text_encoders.sa_t5
 import comfy.text_encoders.aura_t5
+import comfy.text_encoders.hydit
+import comfy.text_encoders.flux
+import comfy.text_encoders.long_clipl
+import comfy.text_encoders.genmo
+import comfy.text_encoders.lt

 import comfy.model_patcher
 import comfy.lora
+import comfy.lora_convert
 import comfy.t2i_adapter.adapter
-import comfy.supported_models_base
 import comfy.taesd.taesd

+import comfy.ldm.flux.redux
+
 def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
    key_map = {}
    if model is not None:
@@ -36,6 +45,7 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
    if clip is not None:
        key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map)

+    lora = comfy.lora_convert.convert_lora(lora)
    loaded = comfy.lora.load_lora(lora, key_map)
    if model is not None:
        new_modelpatcher = model.clone()
@@ -60,29 +70,38 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):


 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
        if no_init:
            return
        params = target.params.copy()
        clip = target.clip
        tokenizer = target.tokenizer

-        load_device = model_management.text_encoder_device()
-        offload_device = model_management.text_encoder_offload_device()
-        params['device'] = offload_device
-        dtype = model_management.text_encoder_dtype(load_device)
+        load_device = model_options.get("load_device", model_management.text_encoder_device())
+        offload_device = model_options.get("offload_device", model_management.text_encoder_offload_device())
+        dtype = model_options.get("dtype", None)
+        if dtype is None:
+            dtype = model_management.text_encoder_dtype(load_device)
+
        params['dtype'] = dtype
+        params['device'] = model_options.get("initial_device", model_management.text_encoder_initial_device(load_device, offload_device, parameters * model_management.dtype_size(dtype)))
+        params['model_options'] = model_options

        self.cond_stage_model = clip(**(params))

        for dt in self.cond_stage_model.dtypes:
            if not model_management.supports_cast(load_device, dt):
                load_device = offload_device
+                if params['device'] != offload_device:
+                    self.cond_stage_model.to(offload_device)
+                    logging.warning("Had to shift TE back.")

-        self.tokenizer = tokenizer(embedding_directory=embedding_directory)
+        self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
+        if params['device'] == load_device:
+            model_management.load_models_gpu([self.patcher], force_full_load=True)
        self.layer_idx = None
-        logging.debug("CLIP model load device: {}, offload device: {}".format(load_device, offload_device))
+        logging.debug("CLIP model load device: {}, offload device: {}, current: {}".format(load_device, offload_device, params['device']))

    def clone(self):
        n = CLIP(no_init=True)
@@ -135,7 +154,11 @@ class CLIP:
            return self.cond_stage_model.load_sd(sd)

    def get_sd(self):
-        return self.cond_stage_model.state_dict()
+        sd_clip = self.cond_stage_model.state_dict()
+        sd_tokenizer = self.tokenizer.state_dict()
+        for k in sd_tokenizer:
+            sd_clip[k] = sd_tokenizer[k]
+        return sd_clip

    def load_model(self):
        model_management.load_model_gpu(self.patcher)
@@ -154,6 +177,7 @@ class VAE:
        self.downscale_ratio = 8
        self.upscale_ratio = 8
        self.latent_channels = 4
+        self.latent_dim = 2
        self.output_channels = 3
        self.process_input = lambda image: image * 2.0 - 1.0
        self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
@@ -223,9 +247,30 @@ class VAE:
                self.output_channels = 2
                self.upscale_ratio = 2048
                self.downscale_ratio =  2048
+                self.latent_dim = 1
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+            elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae
+                if "blocks.2.blocks.3.stack.5.weight" in sd:
+                    sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."})
+                if "layers.4.layers.1.attn_block.attn.qkv.weight" in sd:
+                    sd = comfy.utils.state_dict_prefix_replace(sd, {"": "encoder."})
+                self.first_stage_model = comfy.ldm.genmo.vae.model.VideoVAE()
+                self.latent_channels = 12
+                self.latent_dim = 3
+                self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1.5 * max(shape[2], 7) * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
+                self.upscale_ratio = (lambda a: max(0, a * 6 - 5), 8, 8)
+                self.working_dtypes = [torch.float16, torch.float32]
+            elif "decoder.up_blocks.0.res_blocks.0.conv1.conv.weight" in sd: #lightricks ltxv
+                self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE()
+                self.latent_channels = 128
+                self.latent_dim = 3
+                self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (70 * max(shape[2], 7) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
+                self.upscale_ratio = 8
+                self.working_dtypes = [torch.bfloat16, torch.float32]
            else:
                logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                self.first_stage_model = None
@@ -281,6 +326,10 @@ class VAE:
        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        return comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device)

+    def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
+        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))
+
    def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64):
        steps = pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x, tile_y, overlap)
        steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x // 2, tile_y * 2, overlap)
@@ -299,6 +348,7 @@ class VAE:
        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=(1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device)

    def decode(self, samples_in):
+        pixel_samples = None
        try:
            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
@@ -306,38 +356,64 @@ class VAE:
            batch_number = int(free_memory / memory_used)
            batch_number = max(1, batch_number)

-            pixel_samples = torch.empty((samples_in.shape[0], self.output_channels) + tuple(map(lambda a: a * self.upscale_ratio, samples_in.shape[2:])), device=self.output_device)
            for x in range(0, samples_in.shape[0], batch_number):
                samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
-                pixel_samples[x:x+batch_number] = self.process_output(self.first_stage_model.decode(samples).to(self.output_device).float())
+                out = self.process_output(self.first_stage_model.decode(samples).to(self.output_device).float())
+                if pixel_samples is None:
+                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
+                pixel_samples[x:x+batch_number] = out
        except model_management.OOM_EXCEPTION as e:
            logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
-            if len(samples_in.shape) == 3:
+            dims = samples_in.ndim - 2
+            if dims == 1:
                pixel_samples = self.decode_tiled_1d(samples_in)
-            else:
+            elif dims == 2:
                pixel_samples = self.decode_tiled_(samples_in)
+            elif dims == 3:
+                pixel_samples = self.decode_tiled_3d(samples_in)

        pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1)
        return pixel_samples

-    def decode_tiled(self, samples, tile_x=64, tile_y=64, overlap = 16):
-        model_management.load_model_gpu(self.patcher)
-        output = self.decode_tiled_(samples, tile_x, tile_y, overlap)
-        return output.movedim(1,-1)
+    def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None):
+        memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
+        dims = samples.ndim - 2
+        args = {}
+        if tile_x is not None:
+            args["tile_x"] = tile_x
+        if tile_y is not None:
+            args["tile_y"] = tile_y
+        if overlap is not None:
+            args["overlap"] = overlap
+
+        if dims == 1:
+            args.pop("tile_y")
+            output = self.decode_tiled_1d(samples, **args)
+        elif dims == 2:
+            output = self.decode_tiled_(samples, **args)
+        elif dims == 3:
+            output = self.decode_tiled_3d(samples, **args)
+        return output.movedim(1, -1)

    def encode(self, pixel_samples):
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
-        pixel_samples = pixel_samples.movedim(-1,1)
+        pixel_samples = pixel_samples.movedim(-1, 1)
+        if self.latent_dim == 3:
+            pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
        try:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
-            batch_number = int(free_memory / memory_used)
+            batch_number = int(free_memory / max(1, memory_used))
            batch_number = max(1, batch_number)
-            samples = torch.empty((pixel_samples.shape[0], self.latent_channels) + tuple(map(lambda a: a // self.downscale_ratio, pixel_samples.shape[2:])), device=self.output_device)
+            samples = None
            for x in range(0, pixel_samples.shape[0], batch_number):
-                pixels_in = self.process_input(pixel_samples[x:x+batch_number]).to(self.vae_dtype).to(self.device)
-                samples[x:x+batch_number] = self.first_stage_model.encode(pixels_in).to(self.output_device).float()
+                pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype).to(self.device)
+                out = self.first_stage_model.encode(pixels_in).to(self.output_device).float()
+                if samples is None:
+                    samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
+                samples[x:x + batch_number] = out

        except model_management.OOM_EXCEPTION as e:
            logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
@@ -371,6 +447,8 @@ def load_style_model(ckpt_path):
    keys = model_data.keys()
    if "style_embedding" in keys:
        model = comfy.t2i_adapter.adapter.StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8)
+    elif "redux_down.weight" in keys:
+        model = comfy.ldm.flux.redux.ReduxImageEncoder()
    else:
        raise Exception("invalid style model {}".format(ckpt_path))
    model.load_state_dict(model_data)
@@ -381,11 +459,56 @@ class CLIPType(Enum):
    STABLE_CASCADE = 2
    SD3 = 3
    STABLE_AUDIO = 4
+    HUNYUAN_DIT = 5
+    FLUX = 6
+    MOCHI = 7
+    LTXV = 8

-def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION):
+def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
    clip_data = []
    for p in ckpt_paths:
        clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))
+    return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
+
+
+class TEModel(Enum):
+    CLIP_L = 1
+    CLIP_H = 2
+    CLIP_G = 3
+    T5_XXL = 4
+    T5_XL = 5
+    T5_BASE = 6
+
+def detect_te_model(sd):
+    if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
+        return TEModel.CLIP_G
+    if "text_model.encoder.layers.22.mlp.fc1.weight" in sd:
+        return TEModel.CLIP_H
+    if "text_model.encoder.layers.0.mlp.fc1.weight" in sd:
+        return TEModel.CLIP_L
+    if "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
+        weight = sd["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
+        if weight.shape[-1] == 4096:
+            return TEModel.T5_XXL
+        elif weight.shape[-1] == 2048:
+            return TEModel.T5_XL
+    if "encoder.block.0.layer.0.SelfAttention.k.weight" in sd:
+        return TEModel.T5_BASE
+    return None
+
+
+def t5xxl_detect(clip_data):
+    weight_name = "encoder.block.23.layer.1.DenseReluDense.wi_1.weight"
+
+    for sd in clip_data:
+        if weight_name in sd:
+            return comfy.text_encoders.sd3_clip.t5_xxl_detect(sd)
+
+    return {}
+
+
+def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
+    clip_data = state_dicts

    class EmptyClass:
        pass
@@ -400,43 +523,68 @@ def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DI
    clip_target = EmptyClass()
    clip_target.params = {}
    if len(clip_data) == 1:
-        if "text_model.encoder.layers.30.mlp.fc1.weight" in clip_data[0]:
+        te_model = detect_te_model(clip_data[0])
+        if te_model == TEModel.CLIP_G:
            if clip_type == CLIPType.STABLE_CASCADE:
                clip_target.clip = sdxl_clip.StableCascadeClipModel
                clip_target.tokenizer = sdxl_clip.StableCascadeTokenizer
+            elif clip_type == CLIPType.SD3:
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
+                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            else:
                clip_target.clip = sdxl_clip.SDXLRefinerClipModel
                clip_target.tokenizer = sdxl_clip.SDXLTokenizer
-        elif "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data[0]:
-            clip_target.clip = sd2_clip.SD2ClipModel
-            clip_target.tokenizer = sd2_clip.SD2Tokenizer
-        elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in clip_data[0]:
-            weight = clip_data[0]["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
-            dtype_t5 = weight.dtype
-            if weight.shape[-1] == 4096:
-                clip_target.clip = sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, dtype_t5=dtype_t5)
-                clip_target.tokenizer = sd3_clip.SD3Tokenizer
-            elif weight.shape[-1] == 2048:
-                clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
-                clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
-        elif "encoder.block.0.layer.0.SelfAttention.k.weight" in clip_data[0]:
-            clip_target.clip = sa_t5.SAT5Model
-            clip_target.tokenizer = sa_t5.SAT5Tokenizer
+        elif te_model == TEModel.CLIP_H:
+            clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
+            clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
+        elif te_model == TEModel.T5_XXL:
+            if clip_type == CLIPType.SD3:
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
+            elif clip_type == CLIPType.LTXV:
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
+            else: #CLIPType.MOCHI
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
+        elif te_model == TEModel.T5_XL:
+            clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
+            clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
+        elif te_model == TEModel.T5_BASE:
+            clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
+            clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
        else:
-            clip_target.clip = sd1_clip.SD1ClipModel
-            clip_target.tokenizer = sd1_clip.SD1Tokenizer
+            if clip_type == CLIPType.SD3:
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
+                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
+            else:
+                clip_target.clip = sd1_clip.SD1ClipModel
+                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
        if clip_type == CLIPType.SD3:
-            clip_target.clip = sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False)
-            clip_target.tokenizer = sd3_clip.SD3Tokenizer
+            te_models = [detect_te_model(clip_data[0]), detect_te_model(clip_data[1])]
+            clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=TEModel.CLIP_L in te_models, clip_g=TEModel.CLIP_G in te_models, t5=TEModel.T5_XXL in te_models, **t5xxl_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
+        elif clip_type == CLIPType.HUNYUAN_DIT:
+            clip_target.clip = comfy.text_encoders.hydit.HyditModel
+            clip_target.tokenizer = comfy.text_encoders.hydit.HyditTokenizer
+        elif clip_type == CLIPType.FLUX:
+            clip_target.clip = comfy.text_encoders.flux.flux_clip(**t5xxl_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.flux.FluxTokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
    elif len(clip_data) == 3:
-        clip_target.clip = sd3_clip.SD3ClipModel
-        clip_target.tokenizer = sd3_clip.SD3Tokenizer
+        clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(**t5xxl_detect(clip_data))
+        clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer

-    clip = CLIP(clip_target, embedding_directory=embedding_directory)
+    parameters = 0
+    tokenizer_data = {}
+    for c in clip_data:
+        parameters += comfy.utils.calculate_parameters(c)
+        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
+
+    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:
@@ -478,25 +626,39 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl

    return (model, clip, vae)

-def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True):
+def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    sd = comfy.utils.load_torch_file(ckpt_path)
-    sd_keys = sd.keys()
+    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
+    if out is None:
+        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
+    return out
+
+def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    clip = None
    clipvision = None
    vae = None
    model = None
    model_patcher = None
-    clip_target = None

    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
    parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
+    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()

    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
    if model_config is None:
-        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
+        return None
+
+    unet_weight_dtype = list(model_config.supported_inference_dtypes)
+    if weight_dtype is not None and model_config.scaled_fp8 is None:
+        unet_weight_dtype.append(weight_dtype)
+
+    model_config.custom_operations = model_options.get("custom_operations", None)
+    unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))
+
+    if unet_dtype is None:
+        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)

-    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)

@@ -506,7 +668,6 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o

    if output_model:
        inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype)
-        offload_device = model_management.unet_offload_device()
        model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
        model.load_model_weights(sd, diffusion_model_prefix)

@@ -520,7 +681,8 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
        if clip_target is not None:
            clip_sd = model_config.process_clip_state_dict(sd)
            if len(clip_sd) > 0:
-                clip = CLIP(clip_target, embedding_directory=embedding_directory)
+                parameters = comfy.utils.calculate_parameters(clip_sd)
+                clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options)
                m, u = clip.load_sd(clip_sd, full_model=True)
                if len(m) > 0:
                    m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
@@ -539,15 +701,16 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
        logging.debug("left over keys: {}".format(left_over))

    if output_model:
-        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device(), current_device=inital_load_device)
+        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
        if inital_load_device != torch.device("cpu"):
            logging.info("loaded straight to GPU")
-            model_management.load_model_gpu(model_patcher)
+            model_management.load_models_gpu([model_patcher], force_full_load=True)

    return (model_patcher, clip, vae, clipvision)


-def load_unet_state_dict(sd): #load unet in diffusers or regular format
+def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffusers or regular format
+    dtype = model_options.get("dtype", None)

    #Allow loading unets from checkpoint files
    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
@@ -556,37 +719,49 @@ def load_unet_state_dict(sd): #load unet in diffusers or regular format
        sd = temp_sd

    parameters = comfy.utils.calculate_parameters(sd)
-    unet_dtype = model_management.unet_dtype(model_params=parameters)
+    weight_dtype = comfy.utils.weight_dtype(sd)
+
    load_device = model_management.get_torch_device()
    model_config = model_detection.model_config_from_unet(sd, "")

    if model_config is not None:
        new_sd = sd
-    elif 'transformer_blocks.0.attn.add_q_proj.weight' in sd: #MMDIT SD3
+    else:
        new_sd = model_detection.convert_diffusers_mmdit(sd, "")
-        if new_sd is None:
-            return None
-        model_config = model_detection.model_config_from_unet(new_sd, "")
-        if model_config is None:
-            return None
-    else: #diffusers
-        model_config = model_detection.model_config_from_diffusers_unet(sd)
-        if model_config is None:
-            return None
+        if new_sd is not None: #diffusers mmdit
+            model_config = model_detection.model_config_from_unet(new_sd, "")
+            if model_config is None:
+                return None
+        else: #diffusers unet
+            model_config = model_detection.model_config_from_diffusers_unet(sd)
+            if model_config is None:
+                return None

-        diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config)
+            diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config)

-        new_sd = {}
-        for k in diffusers_keys:
-            if k in sd:
-                new_sd[diffusers_keys[k]] = sd.pop(k)
-            else:
-                logging.warning("{} {}".format(diffusers_keys[k], k))
+            new_sd = {}
+            for k in diffusers_keys:
+                if k in sd:
+                    new_sd[diffusers_keys[k]] = sd.pop(k)
+                else:
+                    logging.warning("{} {}".format(diffusers_keys[k], k))

    offload_device = model_management.unet_offload_device()
-    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
+    unet_weight_dtype = list(model_config.supported_inference_dtypes)
+    if weight_dtype is not None and model_config.scaled_fp8 is None:
+        unet_weight_dtype.append(weight_dtype)
+
+    if dtype is None:
+        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)
+    else:
+        unet_dtype = dtype
+
    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
+    model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
+    if model_options.get("fp8_optimizations", False):
+        model_config.optimizations["fp8"] = True
+
    model = model_config.get_model(new_sd, "")
    model = model.to(offload_device)
    model.load_model_weights(new_sd, "")
@@ -595,24 +770,36 @@ def load_unet_state_dict(sd): #load unet in diffusers or regular format
        logging.info("left over keys in unet: {}".format(left_over))
    return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)

-def load_unet(unet_path):
+
+def load_diffusion_model(unet_path, model_options={}):
    sd = comfy.utils.load_torch_file(unet_path)
-    model = load_unet_state_dict(sd)
+    model = load_diffusion_model_state_dict(sd, model_options=model_options)
    if model is None:
        logging.error("ERROR UNSUPPORTED UNET {}".format(unet_path))
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
    return model

+def load_unet(unet_path, dtype=None):
+    print("WARNING: the load_unet function has been deprecated and will be removed please switch to: load_diffusion_model")
+    return load_diffusion_model(unet_path, model_options={"dtype": dtype})
+
+def load_unet_state_dict(sd, dtype=None):
+    print("WARNING: the load_unet_state_dict function has been deprecated and will be removed please switch to: load_diffusion_model_state_dict")
+    return load_diffusion_model_state_dict(sd, model_options={"dtype": dtype})
+
 def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, metadata=None, extra_keys={}):
    clip_sd = None
    load_models = [model]
    if clip is not None:
        load_models.append(clip.load_model())
        clip_sd = clip.get_sd()
+    vae_sd = None
+    if vae is not None:
+        vae_sd = vae.get_sd()

    model_management.load_models_gpu(load_models, force_patch_weights=True)
    clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
-    sd = model.model.state_dict_for_saving(clip_sd, vae.get_sd(), clip_vision_sd)
+    sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
    for k in extra_keys:
        sd[k] = extra_keys[k]

--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -75,16 +75,15 @@ class ClipTokenWeightEncoder:
        return r

 class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
-    """Uses the CLIP transformer encoder for text (from huggingface)"""
    LAYERS = [
        "last",
        "pooled",
        "hidden"
    ]
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77,
+    def __init__(self, device="cpu", max_length=77,
                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
                 special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False,
-                 return_projected_pooled=True, return_attention_masks=False):  # clip-vit-base-patch32
+                 return_projected_pooled=True, return_attention_masks=False, model_options={}):  # clip-vit-base-patch32
        super().__init__()
        assert layer in self.LAYERS

@@ -94,7 +93,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        with open(textmodel_json_config) as f:
            config = json.load(f)

-        self.transformer = model_class(config, dtype, device, comfy.ops.manual_cast)
+        operations = model_options.get("custom_operations", None)
+        scaled_fp8 = None
+
+        if operations is None:
+            scaled_fp8 = model_options.get("scaled_fp8", None)
+            if scaled_fp8 is not None:
+                operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
+            else:
+                operations = comfy.ops.manual_cast
+
+        self.operations = operations
+        self.transformer = model_class(config, dtype, device, self.operations)
+        if scaled_fp8 is not None:
+            self.transformer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8))
+
        self.num_layers = self.transformer.num_layers

        self.max_length = max_length
@@ -140,15 +153,13 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):

    def set_up_textual_embeddings(self, tokens, current_embeds):
        out_tokens = []
-        next_new_token = token_dict_size = current_embeds.weight.shape[0] - 1
+        next_new_token = token_dict_size = current_embeds.weight.shape[0]
        embedding_weights = []

        for x in tokens:
            tokens_temp = []
            for y in x:
                if isinstance(y, numbers.Integral):
-                    if y == token_dict_size: #EOS token
-                        y = -1
                    tokens_temp += [int(y)]
                else:
                    if y.shape[0] == current_embeds.weight.shape[1]:
@@ -163,12 +174,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):

        n = token_dict_size
        if len(embedding_weights) > 0:
-            new_embedding = torch.nn.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
-            new_embedding.weight[:token_dict_size] = current_embeds.weight[:-1]
+            new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
+            new_embedding.weight[:token_dict_size] = current_embeds.weight
            for x in embedding_weights:
                new_embedding.weight[n] = x
                n += 1
-            new_embedding.weight[n] = current_embeds.weight[-1] #EOS embedding
            self.transformer.set_input_embeddings(new_embedding)

        processed_tokens = []
@@ -197,7 +207,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        if self.enable_attention_masks:
            attention_mask_model = attention_mask

-        outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state)
+        outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
        self.transformer.set_input_embeddings(backup_embeds)

        if self.layer == "last":
@@ -315,6 +325,17 @@ def expand_directory_list(directories):
            dirs.add(root)
    return list(dirs)

+def bundled_embed(embed, prefix, suffix): #bundled embedding in lora format
+    i = 0
+    out_list = []
+    for k in embed:
+        if k.startswith(prefix) and k.endswith(suffix):
+            out_list.append(embed[k])
+    if len(out_list) == 0:
+        return None
+
+    return torch.cat(out_list, dim=0)
+
 def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=None):
    if isinstance(embedding_directory, str):
        embedding_directory = [embedding_directory]
@@ -381,12 +402,16 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
        elif embed_key is not None and embed_key in embed:
            embed_out = embed[embed_key]
        else:
-            values = embed.values()
-            embed_out = next(iter(values))
+            embed_out = bundled_embed(embed, 'bundle_emb.', '.string_to_param.*')
+            if embed_out is None:
+                embed_out = bundled_embed(embed, 'bundle_emb.', '.{}'.format(embed_key))
+            if embed_out is None:
+                values = embed.values()
+                embed_out = next(iter(values))
    return embed_out

 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None, pad_token=None):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None, pad_token=None, tokenizer_data={}):
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
@@ -519,12 +544,15 @@ class SDTokenizer:
    def untokenize(self, token_weight_pair):
        return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair))

+    def state_dict(self):
+        return {}

 class SD1Tokenizer:
-    def __init__(self, embedding_directory=None, clip_name="l", tokenizer=SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer):
        self.clip_name = clip_name
        self.clip = "clip_{}".format(self.clip_name)
-        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory))
+        tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
+        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))

    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
@@ -534,9 +562,15 @@ class SD1Tokenizer:
    def untokenize(self, token_weight_pair):
        return getattr(self, self.clip).untokenize(token_weight_pair)

+    def state_dict(self):
+        return {}
+
+class SD1CheckpointClipModel(SDClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options)

 class SD1ClipModel(torch.nn.Module):
-    def __init__(self, device="cpu", dtype=None, clip_name="l", clip_model=SDClipModel, name=None, **kwargs):
+    def __init__(self, device="cpu", dtype=None, model_options={}, clip_name="l", clip_model=SD1CheckpointClipModel, name=None, **kwargs):
        super().__init__()

        if name is not None:
@@ -546,7 +580,8 @@ class SD1ClipModel(torch.nn.Module):
            self.clip_name = clip_name
            self.clip = "clip_{}".format(self.clip_name)

-        setattr(self, self.clip, clip_model(device=device, dtype=dtype, **kwargs))
+        clip_model = model_options.get("{}_class".format(self.clip), clip_model)
+        setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs))

        self.dtypes = set()
        if dtype is not None:
--- a/comfy/sd1_clip_config.json
+++ b/comfy/sd1_clip_config.json
@@ -6,7 +6,7 @@
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "initializer_factor": 1.0,
--- a/comfy/sdxl_clip.py
+++ b/comfy/sdxl_clip.py
@@ -3,26 +3,27 @@ import torch
 import os

 class SDXLClipG(sd1_clip.SDClipModel):
-    def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
        if layer == "penultimate":
            layer="hidden"
            layer_idx=-2

        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
-                         special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False)
+                         special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options)

    def load_sd(self, sd):
        return super().load_sd(sd)

 class SDXLClipGTokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, tokenizer_path=None, embedding_directory=None):
+    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')


 class SDXLTokenizer:
-    def __init__(self, embedding_directory=None):
-        self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory)
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
+        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)

    def tokenize_with_weights(self, text:str, return_word_ids=False):
@@ -34,11 +35,15 @@ class SDXLTokenizer:
    def untokenize(self, token_weight_pair):
        return self.clip_g.untokenize(token_weight_pair)

+    def state_dict(self):
+        return {}
+
 class SDXLClipModel(torch.nn.Module):
-    def __init__(self, device="cpu", dtype=None):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__()
-        self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False)
-        self.clip_g = SDXLClipG(device=device, dtype=dtype)
+        clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
+        self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options)
+        self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options)
        self.dtypes = set([dtype])

    def set_clip_options(self, options):
@@ -54,7 +59,8 @@ class SDXLClipModel(torch.nn.Module):
        token_weight_pairs_l = token_weight_pairs["l"]
        g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
-        return torch.cat([l_out, g_out], dim=-1), g_pooled
+        cut_to = min(l_out.shape[1], g_out.shape[1])
+        return torch.cat([l_out[:,:cut_to], g_out[:,:cut_to]], dim=-1), g_pooled

    def load_sd(self, sd):
        if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -63,27 +69,27 @@ class SDXLClipModel(torch.nn.Module):
            return self.clip_l.load_sd(sd)

 class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None):
-        super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG)
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG, model_options=model_options)


 class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, tokenizer_path=None, embedding_directory=None):
+    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')

 class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
-    def __init__(self, embedding_directory=None):
-        super().__init__(embedding_directory=embedding_directory, clip_name="g", tokenizer=StableCascadeClipGTokenizer)
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="g", tokenizer=StableCascadeClipGTokenizer)

 class StableCascadeClipG(sd1_clip.SDClipModel):
-    def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None):
+    def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
-                         special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True)
+                         special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options)

    def load_sd(self, sd):
        return super().load_sd(sd)

 class StableCascadeClipModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None):
-        super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG)
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG, model_options=model_options)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -3,11 +3,15 @@ from . import model_base
 from . import utils

 from . import sd1_clip
-from . import sd2_clip
 from . import sdxl_clip
-from . import sd3_clip
-from . import sa_t5
+import comfy.text_encoders.sd2_clip
+import comfy.text_encoders.sd3_clip
+import comfy.text_encoders.sa_t5
 import comfy.text_encoders.aura_t5
+import comfy.text_encoders.hydit
+import comfy.text_encoders.flux
+import comfy.text_encoders.genmo
+import comfy.text_encoders.lt

 from . import supported_models_base
 from . import latent_formats
@@ -29,6 +33,7 @@ class SD15(supported_models_base.BASE):
    }

    latent_format = latent_formats.SD15
+    memory_usage_factor = 1.0

    def process_clip_state_dict(self, state_dict):
        k = list(state_dict.keys())
@@ -75,6 +80,7 @@ class SD20(supported_models_base.BASE):
    }

    latent_format = latent_formats.SD15
+    memory_usage_factor = 1.0

    def model_type(self, state_dict, prefix=""):
        if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
@@ -100,7 +106,7 @@ class SD20(supported_models_base.BASE):
        return state_dict

    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sd2_clip.SD2Tokenizer, sd2_clip.SD2ClipModel)
+        return supported_models_base.ClipTarget(comfy.text_encoders.sd2_clip.SD2Tokenizer, comfy.text_encoders.sd2_clip.SD2ClipModel)

 class SD21UnclipL(SD20):
    unet_config = {
@@ -138,6 +144,7 @@ class SDXLRefiner(supported_models_base.BASE):
    }

    latent_format = latent_formats.SDXL
+    memory_usage_factor = 1.0

    def get_model(self, state_dict, prefix="", device=None):
        return model_base.SDXLRefiner(self, device=device)
@@ -176,6 +183,8 @@ class SDXL(supported_models_base.BASE):

    latent_format = latent_formats.SDXL

+    memory_usage_factor = 0.8
+
    def model_type(self, state_dict, prefix=""):
        if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5
            self.latent_format = latent_formats.SDXL_Playground_2_5()
@@ -189,6 +198,8 @@ class SDXL(supported_models_base.BASE):
                self.sampling_settings["sigma_min"] = float(state_dict["edm_vpred.sigma_min"].item())
            return model_base.ModelType.V_PREDICTION_EDM
        elif "v_pred" in state_dict:
+            if "ztsnr" in state_dict: #Some zsnr anime checkpoints
+                self.sampling_settings["zsnr"] = True
            return model_base.ModelType.V_PREDICTION
        else:
            return model_base.ModelType.EPS
@@ -503,6 +514,9 @@ class SD3(supported_models_base.BASE):

    unet_extra_config = {}
    latent_format = latent_formats.SD3
+
+    memory_usage_factor = 1.2
+
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
@@ -519,12 +533,11 @@ class SD3(supported_models_base.BASE):
            clip_l = True
        if "{}clip_g.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
            clip_g = True
-        t5_key = "{}t5xxl.transformer.encoder.final_layer_norm.weight".format(pref)
-        if t5_key in state_dict:
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        if "dtype_t5" in t5_detect:
            t5 = True
-            dtype_t5 = state_dict[t5_key].dtype

-        return supported_models_base.ClipTarget(sd3_clip.SD3Tokenizer, sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5))
+        return supported_models_base.ClipTarget(comfy.text_encoders.sd3_clip.SD3Tokenizer, comfy.text_encoders.sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, **t5_detect))

 class StableAudio(supported_models_base.BASE):
    unet_config = {
@@ -555,7 +568,7 @@ class StableAudio(supported_models_base.BASE):
        return utils.state_dict_prefix_replace(state_dict, replace_prefix)

    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sa_t5.SAT5Tokenizer, sa_t5.SAT5Model)
+        return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model)

 class AuraFlow(supported_models_base.BASE):
    unet_config = {
@@ -580,6 +593,144 @@ class AuraFlow(supported_models_base.BASE):
    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.aura_t5.AuraT5Tokenizer, comfy.text_encoders.aura_t5.AuraT5Model)

-models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow]
+class HunyuanDiT(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "hydit",
+    }
+
+    unet_extra_config = {
+        "attn_precision": torch.float32,
+    }
+
+    sampling_settings = {
+        "linear_start": 0.00085,
+        "linear_end": 0.018,
+    }
+
+    latent_format = latent_formats.SDXL
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.HunyuanDiT(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.hydit.HyditTokenizer, comfy.text_encoders.hydit.HyditModel)
+
+class HunyuanDiT1(HunyuanDiT):
+    unet_config = {
+        "image_model": "hydit1",
+    }
+
+    unet_extra_config = {}
+
+    sampling_settings = {
+        "linear_start" : 0.00085,
+        "linear_end" : 0.03,
+    }
+
+class Flux(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "flux",
+        "guidance_embed": True,
+    }
+
+    sampling_settings = {
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+
+    memory_usage_factor = 2.8
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Flux(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
+
+class FluxSchnell(Flux):
+    unet_config = {
+        "image_model": "flux",
+        "guidance_embed": False,
+    }
+
+    sampling_settings = {
+        "multiplier": 1.0,
+        "shift": 1.0,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
+        return out
+
+class GenmoMochi(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "mochi_preview",
+    }
+
+    sampling_settings = {
+        "multiplier": 1.0,
+        "shift": 6.0,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Mochi
+
+    memory_usage_factor = 2.0 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.GenmoMochi(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.genmo.MochiT5Tokenizer, comfy.text_encoders.genmo.mochi_te(**t5_detect))
+
+class LTXV(supported_models_base.BASE):
+    unet_config = {
+        "image_model": "ltxv",
+    }
+
+    sampling_settings = {
+        "shift": 2.37,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.LTXV
+
+    memory_usage_factor = 2.7
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.LTXV(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        pref = self.text_encoder_key_prefix[0]
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.lt.LTXVT5Tokenizer, comfy.text_encoders.lt.ltxv_te(**t5_detect))
+
+models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, HunyuanDiT, HunyuanDiT1, Flux, FluxSchnell, GenmoMochi, LTXV]

 models += [SVD_img2vid]
--- a/comfy/supported_models_base.py
+++ b/comfy/supported_models_base.py
@@ -1,3 +1,21 @@
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
 import torch
 from . import model_base
 from . import utils
@@ -27,7 +45,12 @@ class BASE:
    text_encoder_key_prefix = ["cond_stage_model."]
    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]

+    memory_usage_factor = 2.0
+
    manual_cast_dtype = None
+    custom_operations = None
+    scaled_fp8 = None
+    optimizations = {"fp8": False}

    @classmethod
    def matches(s, unet_config, state_dict=None):
@@ -50,6 +73,7 @@ class BASE:
        self.unet_config = unet_config.copy()
        self.sampling_settings = self.sampling_settings.copy()
        self.latent_format = self.latent_format()
+        self.optimizations = self.optimizations.copy()
        for x in self.unet_extra_config:
            self.unet_config[x] = self.unet_extra_config[x]

--- a/comfy/text_encoders/aura_t5.py
+++ b/comfy/text_encoders/aura_t5.py
@@ -1,22 +1,22 @@
 from comfy import sd1_clip
-from .llama_tokenizer import LLAMATokenizer
-import comfy.t5
+from .spiece_tokenizer import SPieceTokenizer
+import comfy.text_encoders.t5
 import os

 class PT5XlModel(sd1_clip.SDClipModel):
-    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_config_xl.json")
-        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.t5.T5, enable_attention_masks=True, zero_out_masked=True)
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True, model_options=model_options)

 class PT5XlTokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, embedding_directory=None):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=LLAMATokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)

 class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
-    def __init__(self, embedding_directory=None):
-        super().__init__(embedding_directory=embedding_directory, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)

 class AuraT5Model(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, **kwargs):
-        super().__init__(device=device, dtype=dtype, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)
+    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
+        super().__init__(device=device, dtype=dtype, model_options=model_options, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)
--- a/comfy/text_encoders/bert.py
+++ b/comfy/text_encoders/bert.py
@@ -0,0 +1,140 @@
+import torch
+from comfy.ldm.modules.attention import optimized_attention_for_device
+import comfy.ops
+
+class BertAttention(torch.nn.Module):
+    def __init__(self, embed_dim, heads, dtype, device, operations):
+        super().__init__()
+
+        self.heads = heads
+        self.query = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+        self.key = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+        self.value = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+
+
+    def forward(self, x, mask=None, optimized_attention=None):
+        q = self.query(x)
+        k = self.key(x)
+        v = self.value(x)
+
+        out = optimized_attention(q, k, v, self.heads, mask)
+        return out
+
+class BertOutput(torch.nn.Module):
+    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
+        self.LayerNorm = operations.LayerNorm(output_dim, eps=layer_norm_eps, dtype=dtype, device=device)
+        # self.dropout = nn.Dropout(0.0)
+
+    def forward(self, x, y):
+        x = self.dense(x)
+        # hidden_states = self.dropout(hidden_states)
+        x = self.LayerNorm(x + y)
+        return x
+
+class BertAttentionBlock(torch.nn.Module):
+    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.self = BertAttention(embed_dim, heads, dtype, device, operations)
+        self.output = BertOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
+
+    def forward(self, x, mask, optimized_attention):
+        y = self.self(x, mask, optimized_attention)
+        return self.output(y, x)
+
+class BertIntermediate(torch.nn.Module):
+    def __init__(self, embed_dim, intermediate_dim, dtype, device, operations):
+        super().__init__()
+        self.dense = operations.Linear(embed_dim, intermediate_dim, dtype=dtype, device=device)
+
+    def forward(self, x):
+        x = self.dense(x)
+        return torch.nn.functional.gelu(x)
+
+
+class BertBlock(torch.nn.Module):
+    def __init__(self, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.attention = BertAttentionBlock(embed_dim, heads, layer_norm_eps, dtype, device, operations)
+        self.intermediate = BertIntermediate(embed_dim, intermediate_dim, dtype, device, operations)
+        self.output = BertOutput(intermediate_dim, embed_dim, layer_norm_eps, dtype, device, operations)
+
+    def forward(self, x, mask, optimized_attention):
+        x = self.attention(x, mask, optimized_attention)
+        y = self.intermediate(x)
+        return self.output(y, x)
+
+class BertEncoder(torch.nn.Module):
+    def __init__(self, num_layers, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.layer = torch.nn.ModuleList([BertBlock(embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations) for i in range(num_layers)])
+
+    def forward(self, x, mask=None, intermediate_output=None):
+        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)
+
+        if intermediate_output is not None:
+            if intermediate_output < 0:
+                intermediate_output = len(self.layer) + intermediate_output
+
+        intermediate = None
+        for i, l in enumerate(self.layer):
+            x = l(x, mask, optimized_attention)
+            if i == intermediate_output:
+                intermediate = x.clone()
+        return x, intermediate
+
+class BertEmbeddings(torch.nn.Module):
+    def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, pad_token_id, embed_dim, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.word_embeddings = operations.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id, dtype=dtype, device=device)
+        self.position_embeddings = operations.Embedding(max_position_embeddings, embed_dim, dtype=dtype, device=device)
+        self.token_type_embeddings = operations.Embedding(type_vocab_size, embed_dim, dtype=dtype, device=device)
+
+        self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def forward(self, input_tokens, token_type_ids=None, dtype=None):
+        x = self.word_embeddings(input_tokens, out_dtype=dtype)
+        x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
+        if token_type_ids is not None:
+            x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
+        else:
+            x += comfy.ops.cast_to_input(self.token_type_embeddings.weight[0], x)
+        x = self.LayerNorm(x)
+        return x
+
+
+class BertModel_(torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        embed_dim = config_dict["hidden_size"]
+        layer_norm_eps = config_dict["layer_norm_eps"]
+
+        self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
+        self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
+
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+        x = self.embeddings(input_tokens, dtype=dtype)
+        mask = None
+        if attention_mask is not None:
+            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
+            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
+
+        x, i = self.encoder(x, mask, intermediate_output)
+        return x, i
+
+
+class BertModel(torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        self.bert = BertModel_(config_dict, dtype, device, operations)
+        self.num_layers = config_dict["num_hidden_layers"]
+
+    def get_input_embeddings(self):
+        return self.bert.embeddings.word_embeddings
+
+    def set_input_embeddings(self, embeddings):
+        self.bert.embeddings.word_embeddings = embeddings
+
+    def forward(self, *args, **kwargs):
+        return self.bert(*args, **kwargs)
--- a/comfy/text_encoders/flux.py
+++ b/comfy/text_encoders/flux.py
@@ -0,0 +1,72 @@
+from comfy import sd1_clip
+import comfy.text_encoders.t5
+import comfy.text_encoders.sd3_clip
+import comfy.model_management
+from transformers import T5TokenizerFast
+import torch
+import os
+
+class T5XXLTokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
+
+
+class FluxTokenizer:
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
+        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
+        self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
+        out = {}
+        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
+        out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
+        return out
+
+    def untokenize(self, token_weight_pair):
+        return self.clip_l.untokenize(token_weight_pair)
+
+    def state_dict(self):
+        return {}
+
+
+class FluxClipModel(torch.nn.Module):
+    def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
+        super().__init__()
+        dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
+        clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
+        self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
+        self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
+        self.dtypes = set([dtype, dtype_t5])
+
+    def set_clip_options(self, options):
+        self.clip_l.set_clip_options(options)
+        self.t5xxl.set_clip_options(options)
+
+    def reset_clip_options(self):
+        self.clip_l.reset_clip_options()
+        self.t5xxl.reset_clip_options()
+
+    def encode_token_weights(self, token_weight_pairs):
+        token_weight_pairs_l = token_weight_pairs["l"]
+        token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
+
+        t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
+        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
+        return t5_out, l_pooled
+
+    def load_sd(self, sd):
+        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
+            return self.clip_l.load_sd(sd)
+        else:
+            return self.t5xxl.load_sd(sd)
+
+def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
+    class FluxClipModel_(FluxClipModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
+                model_options = model_options.copy()
+                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
+            super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
+    return FluxClipModel_
--- a/Show More
+++ b/Show More