Compare commits

...

419 Commits

Author SHA1 Message Date
comfyanonymous
8ce2a1052c Optimizations to --fast and scaled fp8. 2024-10-22 02:12:28 -04:00
comfyanonymous
f82314fcfc Fix duplicate sigmas on beta scheduler. 2024-10-21 20:19:45 -04:00
comfyanonymous
0075c6d096 Mixed precision diffusion models with scaled fp8.
This change adds support for diffusion models where all the linear layers are
scaled fp8 while the other weights are in the original precision.
2024-10-21 18:12:51 -04:00
comfyanonymous
83ca891118 Support scaled fp8 t5xxl model. 2024-10-20 22:27:00 -04:00
comfyanonymous
f9f9faface Fixed model merging issue with scaled fp8. 2024-10-20 06:24:31 -04:00
comfyanonymous
471cd3eace fp8 casting is fast on GPUs that support fp8 compute. 2024-10-20 00:54:47 -04:00
comfyanonymous
a68bbafddb Support diffusion models with scaled fp8 weights. 2024-10-19 23:47:42 -04:00
comfyanonymous
73e3a9e676 Clamp output when rounding weight to prevent NaN. 2024-10-19 19:07:10 -04:00
comfyanonymous
518c0dc2fe Add tooltips to LoraSave node. 2024-10-18 06:01:09 -04:00
comfyanonymous
ce0542e10b Add a note that python 3.13 is not yet supported to the README. 2024-10-17 19:27:37 -04:00
comfyanonymous
8473019d40 Pytorch can be shipped with numpy 2 now. 2024-10-17 19:15:17 -04:00
Xiaodong Xie
89f15894dd Ignore more network related errors during websocket communication. (#5269)
Intermittent network issues during websocket communication should not crash the ComfyUI process.

Co-authored-by: Xiaodong Xie <xie.xiaodong@frever.com>
2024-10-17 18:31:45 -04:00
comfyanonymous
67158994a4 Use the lowvram cast_to function for everything. 2024-10-17 17:25:56 -04:00
comfyanonymous
7390ff3b1e Add missing import. 2024-10-16 14:58:30 -04:00
comfyanonymous
0bedfb26af Revert "Fix Transformers FutureWarning (#5140)"
This reverts commit 95b7cf9bbe.
2024-10-16 12:36:19 -04:00
comfyanonymous
f71cfd2687 Add an experimental node to sharpen latents.
Can be used with LatentApplyOperationCFG for interesting results.
2024-10-16 05:25:31 -04:00
Alex "mcmonkey" Goodwin
c695c4af7f Frontend Manager: avoid redundant gh calls for static versions (#5152)
* Frontend Manager: avoid redundant gh calls for static versions

* actually, removing old tmpdir isn't needed

I tested - downloader code handles this case well already
(also rmdir was wrong func anyway, needed shutil.rmtree if it had content)

* add code comment
2024-10-16 03:35:37 -04:00
comfyanonymous
0dbba9f751 Add some latent operation nodes.
This is a port of the ModelSamplerTonemapNoiseTest from the experiments
repo.

To replicate that node use LatentOperationTonemapReinhard and
LatentApplyOperationCFG together.
2024-10-15 15:00:36 -04:00
comfyanonymous
f584758271 Cleanup some useless lines. 2024-10-14 21:02:39 -04:00
svdc
95b7cf9bbe Fix Transformers FutureWarning (#5140)
* Update sd1_clip.py

Fix Transformers FutureWarning

* Update sd1_clip.py

Fix comment
2024-10-14 20:12:20 -04:00
comfyanonymous
191a0d56b4 Switch default packaging workflows to python 3.12 2024-10-13 06:59:31 -04:00
comfyanonymous
3c60ecd7a8 Fix fp8 ops staying enabled. 2024-10-12 14:10:13 -04:00
comfyanonymous
7ae6626723 Remove useless argument. 2024-10-12 07:16:21 -04:00
comfyanonymous
6632365e16 model_options consistency between functions.
weight_dtype -> dtype
2024-10-11 20:51:19 -04:00
Kadir Nar
ad07796777 🐛 Add device to variable c (#5210) 2024-10-11 20:37:50 -04:00
comfyanonymous
1b80895285 Make clip loader nodes support loading sd3 t5xxl in lower precision.
Add attention mask support in the SD3 text encoder code.
2024-10-10 15:06:15 -04:00
Dr.Lt.Data
5f9d5a244b Hotfix for the div zero occurrence when memory_used_encode is 0 (#5121)
https://github.com/comfyanonymous/ComfyUI/issues/5069#issuecomment-2382656368
2024-10-09 23:34:34 -04:00
Chenlei Hu
14eba07acd Update web content to release v1.3.11 (#5189)
* Update web content to release v1.3.11

* nit
2024-10-09 22:37:04 -04:00
Jonathan Avila
4b2f0d9413 Increase maximum macOS version to 15.0.1 when forcing upcast attention (#5191) 2024-10-09 22:21:41 -04:00
Yoland Yan
25eac1d780 Change runner label for the new runners (#5197) 2024-10-09 20:08:57 -04:00
comfyanonymous
e38c94228b Add a weight_dtype fp8_e4m3fn_fast to the Diffusion Model Loader node.
This is used to load weights in fp8 and use fp8 matrix multiplication.
2024-10-09 19:43:17 -04:00
comfyanonymous
203942c8b2 Fix flux doras with diffusers keys. 2024-10-08 19:03:40 -04:00
Brendan Hoar
3c72c89a52 Update folder_paths.py - try/catch for special file_name values (#5187)
Somehow managed to drop a file called "nul" into a Windows checkpoints subdirectory. This caused all sorts of havoc with many nodes that needed the list of checkpoints.
2024-10-08 15:04:32 -04:00
Chenlei Hu
614377abd6 Update web content to release v1.2.64 (#5124) 2024-10-07 17:15:29 -04:00
comfyanonymous
8dfa0cc552 Make SD3 fast previews a little better. 2024-10-07 09:19:59 -04:00
comfyanonymous
e5ecdfdd2d Make fast previews for SDXL a little better by adding a bias. 2024-10-06 19:27:04 -04:00
comfyanonymous
7d29fbf74b Slightly improve the fast previews for flux by adding a bias. 2024-10-06 17:55:46 -04:00
Lex
2c641e64ad IS_CHANGED should be a classmethod (#5159) 2024-10-06 05:47:51 -04:00
comfyanonymous
7d2467e830 Some minor cleanups. 2024-10-05 13:22:39 -04:00
comfyanonymous
6f021d8aa0 Let --verbose have an argument for the log level. 2024-10-04 10:05:34 -04:00
comfyanonymous
d854ed0bcf Allow using SD3 type te output on flux model. 2024-10-03 09:44:54 -04:00
comfyanonymous
abcd006b8c Allow more permutations of clip/t5 in dual clip loader. 2024-10-03 09:26:11 -04:00
comfyanonymous
d985d1d7dc CLIP Loader node now supports clip_l and clip_g only for SD3. 2024-10-02 04:25:17 -04:00
comfyanonymous
d1cdf51e1b Refactor some of the TE detection code. 2024-10-01 07:08:41 -04:00
comfyanonymous
b4626ab93e Add simpletuner lycoris format for SD unet. 2024-09-30 06:03:27 -04:00
comfyanonymous
a9e459c2a4 Use torch.nn.functional.linear in RGB preview code.
Add an optional bias to the latent RGB preview code.
2024-09-29 11:27:49 -04:00
comfyanonymous
3bb4dec720 Fix issue with loras, lowvram and --fast fp8. 2024-09-28 14:42:32 -04:00
City
8733191563 Flux torch.compile fix (#5082) 2024-09-27 22:07:51 -04:00
comfyanonymous
83b01f960a Add backend option to TorchCompileModel.
If you want to use the cudagraphs backend you need to launch with --disable-cuda-malloc.

If you get other backends working feel free to make a PR to add them.
2024-09-27 02:12:37 -04:00
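A minimal sketch of what the backend option amounts to; only the torch.compile backend strings are standard PyTorch, the wrapper function here is an assumption rather than the actual node code.

```python
import torch

# Minimal sketch: the node presumably wraps torch.compile around the diffusion model.
# "inductor" is PyTorch's default backend; "cudagraphs" is the alternative mentioned above
# (which also requires launching ComfyUI with --disable-cuda-malloc).
def compile_model(model: torch.nn.Module, backend: str = "inductor") -> torch.nn.Module:
    return torch.compile(model, backend=backend)
```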
comfyanonymous
d72e871cfa Add a note that the experimental model downloader api will be removed. 2024-09-26 03:17:52 -04:00
comfyanonymous
037c3159b6 Move some nodes out of _for_testing. 2024-09-25 08:41:22 -04:00
comfyanonymous
bdd4a22a2e Fix flux TE not loading t5 embeddings. 2024-09-24 22:57:22 -04:00
comfyanonymous
fdf37566ef Add batch size to EmptyLatentAudio. 2024-09-24 04:32:55 -04:00
Alex "mcmonkey" Goodwin
08c8968482 Internal download API: Add proper validated directory input (#4981)
* add internal /folder_paths route

returns a JSON map of folder paths

* (minor) format download_models.py

* initial folder path input on download api

* actually, require folder_path and clean up some code

* partial tests update

* fix & logging

* also download to a tmp file not the live file

to avoid compounding errors from network failure

* update tests again

* test tweaks

* workaround the first tests blocker

* fix file handling in tests

* rewrite test for create_model_path

* minor doc fix

* avoid 'mock_directory'

use temp dir to avoid accidental fs pollution from tests
2024-09-24 03:50:45 -04:00
chaObserv
479a427a48 Add dpmpp_2m_cfg_pp (#4992) 2024-09-24 02:42:56 -04:00
comfyanonymous
3a0eeee320 Make --listen listen on both ipv4 and ipv6 at the same time by default. 2024-09-23 04:38:19 -04:00
comfyanonymous
447da7ea86 Support listening on multiple addresses. 2024-09-23 04:36:59 -04:00
comfyanonymous
9c41bc8d10 Remove useless line. 2024-09-23 02:32:29 -04:00
Robin Huang
6ad0ddbae4 Run unit tests on Windows/MacOS as well. (#5018)
* Run unit tests on Windows as well.

* Test on mac.

* Continue running on error.

* Compared normalized paths to work cross platform.

* Only test common set of mimetypes across operating systems.
2024-09-22 05:01:39 -04:00
RandomGitUser321
a55142f904 Add ws.close() to the websocket examples (#5020)
* add ws.close() to websocket examples

* add and explain ws.close() in websocket examples
2024-09-22 04:59:10 -04:00
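A minimal sketch of the pattern the example scripts follow, assuming the websocket-client package and a default local server address; the client id is a placeholder.

```python
import uuid
import websocket  # pip install websocket-client

server_address = "127.0.0.1:8188"   # assumption: default local ComfyUI address
client_id = str(uuid.uuid4())

ws = websocket.WebSocket()
ws.connect(f"ws://{server_address}/ws?clientId={client_id}")
try:
    message = ws.recv()  # read messages until the work you care about is done
    print(message)
finally:
    ws.close()  # the point of the commit above: always release the connection
```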
comfyanonymous
5718ef69bb Add total and free ram to /system_stats. 2024-09-22 03:42:11 -04:00
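A rough sketch of querying that endpoint; the host/port and the exact field names for the new RAM values are assumptions.

```python
import json
import urllib.request

# Assumption: default local server; the field names under "system" may differ.
with urllib.request.urlopen("http://127.0.0.1:8188/system_stats") as resp:
    stats = json.load(resp)

system = stats.get("system", {})
print("total ram:", system.get("ram_total"), "free ram:", system.get("ram_free"))
```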
RandomGitUser321
13ecf10a92 Added to the websockets_api_example.py to show how to decode latent previews from the binary stream (#5016)
* Update websockets_api_example.py

* even more simplified
2024-09-22 02:30:44 -04:00
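A rough sketch of handling such a binary preview frame, assuming the layout the example hints at: a 4-byte big-endian event type, a 4-byte image format, then the image bytes. Treat the constants as assumptions.

```python
import io
import struct
from PIL import Image

def handle_binary_message(payload: bytes):
    # Assumed layout: >I event type, >I image format (1=JPEG, 2=PNG), then image data.
    event_type = struct.unpack(">I", payload[:4])[0]
    if event_type != 1:  # assumption: 1 = preview image event
        return None
    image_format = struct.unpack(">I", payload[4:8])[0]
    image = Image.open(io.BytesIO(payload[8:]))
    return image_format, image
```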
comfyanonymous
7a415f47a9 Add an optional VAE input to the ControlNetApplyAdvanced node.
Deprecate the other controlnet nodes.
2024-09-22 01:24:52 -04:00
Chenlei Hu
89fa2fca24 Update web content to release v1.2.60 (#5017)
* Update web content to release v1.2.60

* Remove dist.zip
2024-09-21 23:28:54 -04:00
comfyanonymous
364b69e931 Make SD3 empty latent image zeros.
This shouldn't change anything. The reason it was not zeros is that it
mattered in early versions of the code.
2024-09-21 09:13:10 -04:00
comfyanonymous
dc96a1ae19 Load controlnet in fp8 if weights are in fp8. 2024-09-21 04:50:12 -04:00
comfyanonymous
2d810b081e Add load_controlnet_state_dict function. 2024-09-21 01:51:51 -04:00
comfyanonymous
9f7e9f0547 Add an error message when a controlnet needs a VAE but none is given. 2024-09-21 01:33:18 -04:00
comfyanonymous
a355f38ecc Make the SD3 controlnet node the default one. 2024-09-21 01:32:46 -04:00
huchenlei
38c69080c7 Add docstring 2024-09-20 03:16:23 -04:00
comfyanonymous
70a708d726 Fix model merging issue. 2024-09-20 02:31:44 -04:00
yoinked
e7d4782736 add laplace scheduler [2407.03297] (#4990)
* add laplace scheduler [2407.03297]

* should be here instead lol

* better settings
2024-09-19 23:23:09 -04:00
Alex "mcmonkey" Goodwin
3326bdfd4e add internal /folder_paths route (#4980)
returns a JSON map of folder paths
2024-09-19 09:52:55 -04:00
Alex "mcmonkey" Goodwin
68bb885d22 add 'is_default' to model paths config (#4979)
* add 'is_default' to model paths config

including impl and doc in example file

* update weirdly overspecific test expectations

* oh there's two

* sigh
2024-09-19 08:59:55 -04:00
comfyanonymous
ad66f7c7d8 Add model_options to load_controlnet function. 2024-09-19 08:23:35 -04:00
Simon Lui
de8e8e3b0d Fix xpu Pytorch nightly build from calling optimize which doesn't exist. (#4978) 2024-09-19 05:11:42 -04:00
Alex "mcmonkey" Goodwin
a1e71cfad1 very simple strong-cache on model list (#4969)
* very simple strong-cache on model list

* store the cache after validation too

* only cache object_info for now

* use a 'with' context
2024-09-19 04:40:14 -04:00
comfyanonymous
0bfc7cc998 Create the temp directory on ComfyUI startup instead. 2024-09-18 09:55:57 -04:00
Tom
7183fd1665 Add route to list model types (#4846)
* Add list models route

* Better readable model types list
2024-09-17 04:22:05 -04:00
Alex "mcmonkey" Goodwin
254838f23c add simple error check to model loading (#4950) 2024-09-17 03:57:17 -04:00
pharmapsychotic
0b7dfa986d Improve tiling calculations to reduce number of tiles that need to be processed. (#4944) 2024-09-17 03:51:10 -04:00
comfyanonymous
d514bb38ee Add some option to model_options for the text encoder.
load_device, offload_device and the initial_device can now be set.
2024-09-17 03:49:54 -04:00
comfyanonymous
0849c80e2a get_key_patches now works without unloading the model. 2024-09-17 01:57:59 -04:00
comfyanonymous
56e8f5e4fd VAEDecodeAudio now does some normalization on the audio. 2024-09-16 00:30:36 -04:00
comfyanonymous
e813abbb2c Long CLIP L support for SDXL, SD3 and Flux.
Use the *CLIPLoader nodes.
2024-09-15 07:59:38 -04:00
JettHu
5e68a4ce67 Reduce repeated calls of INPUT_TYPES in cache (#4922) 2024-09-15 01:03:09 -04:00
comfyanonymous
ca08597670 Make the inpaint controlnet node work with non inpaint ones. 2024-09-14 09:17:13 -04:00
comfyanonymous
f48e390032 Support AliMama SD3 and Flux inpaint controlnets.
Use the ControlNetInpaintingAliMamaApply node.
2024-09-14 09:05:16 -04:00
Chenlei Hu
369a6dd2c4 Remove empty spaces in user_manager.py (#4917) 2024-09-13 23:30:44 -04:00
comfyanonymous
b3ce8fb9fd Revert "Reduce repeated calls of get_immediate_node_signature for ancestors in cache (#4871)"
This reverts commit f6b7194f64.
2024-09-13 23:24:47 -04:00
comfyanonymous
cf80d28689 Support loading controlnets with different input. 2024-09-13 09:54:37 -04:00
Acly
6fb44c4b7c Make adding links/nodes to ExecutionList non-recursive (#4886)
Graphs with 300+ chained nodes run into a maximum recursion depth error (the limit is 1000 in CPython)
2024-09-13 08:25:11 -04:00
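The generic pattern behind such a fix is replacing recursion with an explicit stack; a minimal sketch on a plain adjacency-list graph, not the actual ExecutionList code.

```python
def iter_ancestors(graph: dict[str, list[str]], start: str) -> set[str]:
    """Walk a dependency graph without recursion, so deep chains can't hit
    CPython's recursion limit (default 1000)."""
    seen: set[str] = set()
    stack = [start]
    while stack:
        node = stack.pop()
        for dep in graph.get(node, []):
            if dep not in seen:
                seen.add(dep)
                stack.append(dep)
    return seen
```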
Chenlei Hu
d2247c1e61 Normalize path returned by /userdata to always use / as separator (#4906) 2024-09-13 03:45:31 -04:00
Chenlei Hu
cb12ad7049 Add full_info flag in /userdata endpoint to list out file size and last modified timestamp (#4905)
* Add full_info flag in /userdata endpoint to list out file size and last modified timestamp

* nit
2024-09-13 02:40:59 -04:00
JettHu
f6b7194f64 Reduce repeated calls of get_immediate_node_signature for ancestors in cache (#4871) 2024-09-12 23:02:52 -04:00
comfyanonymous
7c6eb4fb29 Set some nodes as DEPRECATED. 2024-09-12 20:27:07 -04:00
Robin Huang
b962db9952 Add cli arg to override user directory (#4856)
* Override user directory.

* Use overridden user directory.

* Remove prints.

* Remove references to global user_files.

* Remove unused replace_folder function.

* Remove newline.

* Remove global during get_user_directory.

* Add validation.
2024-09-12 08:10:27 -04:00
comfyanonymous
d0b7ab88ba Add a simple experimental TorchCompileModel node.
It probably only works on Linux.

For maximum speed on Flux with Nvidia 40 series/ada and newer try using
this node with fp8_e4m3fn and the --fast argument.
2024-09-12 05:24:25 -04:00
Yoland Yan
405b529545 Minor: update tests-unit README.md (#4896) 2024-09-12 04:53:08 -04:00
comfyanonymous
9d720187f1 types -> comfy_types to fix import issue. 2024-09-12 03:57:46 -04:00
Robin Huang
d247bc5a9c Expand variables in base_path for extra_config_paths.yaml. (#4893)
* Expand variables in base_path for extra_config_paths.yaml.

* Fix comments.
2024-09-12 01:52:06 -04:00
comfyanonymous
9f4daca9d9 Doesn't really make sense for cfg_pp sampler to call regular one. 2024-09-11 02:51:36 -04:00
yoinked
b5d0f2a908 Add CFG++ to DPM++ 2S Ancestral (#3871)
* Update sampling.py

* Update samplers.py

* my bad

* "fix" the sampler

* Update samplers.py

* i named it wrong

* minor sampling improvements

mainly using a dynamic rho value (hey this sounds a lot like smea!!!)

* revert rho change

rho? r? its just 1/2
2024-09-11 02:49:44 -04:00
bymyself
e760bf5c40 Add content-type filter method to folder_paths (#4054)
* Add content-type filter method to folder_paths

* Add unit tests

* Hardcode webp content-type

* Annotate content_types as Literal["image", "video", "audio"]
2024-09-11 02:00:07 -04:00
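A minimal sketch of such a filter using the standard mimetypes module; the function name and the hardcoded webp mapping mirror what the PR describes, but the details are assumptions.

```python
import mimetypes
from typing import Literal

# Assumption: mimetypes may not know webp on every platform, hence the hardcoded entry.
EXTENSION_OVERRIDES = {".webp": "image"}

def filter_files_content_types(
    files: list[str],
    content_types: list[Literal["image", "video", "audio"]],
) -> list[str]:
    result = []
    for f in files:
        ext = "." + f.rsplit(".", 1)[-1].lower() if "." in f else ""
        kind = EXTENSION_OVERRIDES.get(ext)
        if kind is None:
            mime, _ = mimetypes.guess_type(f)
            kind = mime.split("/")[0] if mime else None
        if kind in content_types:
            result.append(f)
    return result
```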
comfyanonymous
36c83cdbba Limit origin check to when host is loopback.
This should still prevent the exploit without breaking things for people
who use reverse proxies.
2024-09-11 01:06:37 -04:00
Yoland Yan
81778a7feb [🗻 Mount Fuji Commit] Add unit tests for folder path utilities (#4869)
All past 30 min of commits are done on the top of Mt Fuji
By Comfy, Robin, and Yoland
All other comfy org members died on the way

Introduced unit tests to verify the correctness of various folder path
utility functions such as `get_directory_by_type`, `annotated_filepath`,
and `recursive_search` among others. These tests cover scenarios
including directory retrieval, filepath annotation, recursive file
searches, and filtering files by extensions, enhancing the robustness
and reliability of the codebase.
2024-09-10 00:44:49 -04:00
comfyanonymous
bc94662b31 Cleanup. 2024-09-10 00:43:37 -04:00
Robin Huang
9fa8faa44a Expand user directory for basepath in extra_models_paths.yaml (#4857)
* Expand user path.

* Add test.

* Add unit test for expanding base path.

* Simplify unit test.

* Remove comment.

* Remove comment.

* Checkpoints.

* Refactor.
2024-09-10 00:33:44 -04:00
comfyanonymous
9a7444e39f Add diffusion_models to the extra_model_paths.yaml.example 2024-09-10 00:21:33 -04:00
comfyanonymous
54fca4a218 If host does not contain a port only compare the hostnames. 2024-09-09 16:28:23 -04:00
Chenlei Hu
cd4955367e Add back CI action for tests-ui (#4859) 2024-09-09 04:32:55 -04:00
david02871
8354203d95 Add .venv to gitignore (#4756) 2024-09-09 04:31:18 -04:00
comfyanonymous
e0b41243b4 Fix issue where sometimes origin doesn't contain the port. 2024-09-09 03:18:17 -04:00
Alex "mcmonkey" Goodwin
619263d4a6 allow current timestamp in save image prefix (#4030) 2024-09-09 02:55:51 -04:00
comfyanonymous
e3b0402bb7 Ignore origin domain when it's empty. 2024-09-09 01:04:56 -04:00
Darion
967867d48c fix: url decode filename from API (#4801) 2024-09-08 21:02:32 -04:00
comfyanonymous
cbaac71bf5 Fix issue with last commit. 2024-09-08 19:35:23 -04:00
comfyanonymous
3ab3516e46 By default only accept requests where origin header matches the host.
Browsers are dumb and let any website make requests to localhost; this should
prevent that without breaking things. CORS prevents the JavaScript from
reading the response, but it can still send the request.

At the moment this is only enabled when the --enable-cors-header argument
is not used.
2024-09-08 18:17:29 -04:00
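A rough sketch of the idea as an aiohttp middleware: when the request goes to a loopback host (per the later entry above that limits the check), reject requests whose Origin host doesn't match the Host header. Names and details are assumptions, not the actual server code.

```python
from urllib.parse import urlparse
from aiohttp import web

@web.middleware
async def origin_only_middleware(request: web.Request, handler):
    host = request.headers.get("Host", "")
    origin = request.headers.get("Origin", "")
    hostname = host.split(":")[0]
    loopback = hostname in ("localhost", "127.0.0.1", "::1")
    if loopback and origin:
        origin_host = urlparse(origin).hostname or ""
        if origin_host != hostname:
            return web.Response(status=403, text="Origin does not match host")
    return await handler(request)
```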
comfyanonymous
9c5fca75f4 Fix lora issue. 2024-09-08 10:10:47 -04:00
guill
a5da4d0b3e Fix error with ExecutionBlocker and OUTPUT_IS_LIST (#4836)
This change resolves an error when a node with OUTPUT_IS_LIST=(True,)
receives an ExecutionBlocker. I've also added a unit test for this case.
2024-09-08 09:48:47 -04:00
comfyanonymous
32a60a7bac Support onetrainer text encoder Flux lora. 2024-09-08 09:31:41 -04:00
Jim Winkens
bb52934ba4 Fix import issue (#4815) 2024-09-07 05:28:32 -04:00
comfyanonymous
8aabd7c8c0 SaveLora node can now save "full diff" lora format.
This isn't actually a lora format and is saving the full diff of the
weights in a format that can be used in the lora loader nodes.
2024-09-07 03:21:02 -04:00
comfyanonymous
a09b29ca11 Add an option to the SaveLora node to store the bias diff. 2024-09-07 03:03:30 -04:00
comfyanonymous
9bfee68773 LoraSave node now supports generating text encoder loras.
text_encoder_diff should be connected to a CLIPMergeSubtract node.

model_diff and text_encoder_diff are optional inputs so you can create
model only loras, text encoder only loras or a lora that contains both.
2024-09-07 02:30:12 -04:00
comfyanonymous
ea77750759 Support a generic Comfy format for text encoder loras.
This is a format with keys like:
text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.lora_up.weight

Instead of waiting for me to add support for specific lora formats you can
convert your text encoder loras to this format instead.

If you want to see an example save a text encoder lora with the SaveLora
node with the commit right after this one.
2024-09-07 02:20:39 -04:00
comfyanonymous
c27ebeb1c2 Fix onnx export not working on flux. 2024-09-06 03:21:52 -04:00
guill
0c7c98a965 Nodes using UNIQUE_ID as input are NOT_IDEMPOTENT (#4793)
As suggested by @ltdrdata, we can automatically consider nodes that take
the UNIQUE_ID hidden input to be NOT_IDEMPOTENT.
2024-09-05 19:33:02 -04:00
comfyanonymous
dc2eb75b85 Update stable release workflow to latest pytorch with cuda 12.4. 2024-09-05 19:21:52 -04:00
Chenlei Hu
fa34efe3bd Update frontend to v1.2.47 (#4798)
* Update web content to release v1.2.47

* Update shortcut list
2024-09-05 18:56:01 -04:00
comfyanonymous
5cbaa9e07c Mistoline flux controlnet support. 2024-09-05 00:05:17 -04:00
comfyanonymous
c7427375ee Prioritize freeing partially offloaded models first. 2024-09-04 19:47:32 -04:00
comfyanonymous
22d1241a50 Add an experimental LoraSave node to extract model loras.
The model_diff input should be connected to the output of a
ModelMergeSubtract node.
2024-09-04 16:38:38 -04:00
Jedrzej Kosinski
f04229b84d Add emb_patch support to UNetModel forward (#4779) 2024-09-04 14:35:15 -04:00
Silver
f067ad15d1 Make live preview size a configurable launch argument (#4649)
* Make live preview size a configurable launch argument

* Remove import from testing phase

* Update cli_args.py
2024-09-03 19:16:38 -04:00
comfyanonymous
483004dd1d Support newer glora format. 2024-09-03 17:02:19 -04:00
comfyanonymous
00a5d08103 Lower fp8 lora memory usage. 2024-09-03 01:25:05 -04:00
comfyanonymous
d043997d30 Flux onetrainer lora. 2024-09-02 08:22:15 -04:00
Alex "mcmonkey" Goodwin
f1c2301697 fix typo in stale-issues (#4735) 2024-09-01 17:44:49 -04:00
comfyanonymous
8d31a6632f Speed up inference on nvidia 10 series on Linux. 2024-09-01 17:29:31 -04:00
comfyanonymous
b643eae08b Make minimum_inference_memory() depend on --reserve-vram 2024-09-01 01:18:34 -04:00
comfyanonymous
baa6b4dc36 Update manual install instructions. 2024-08-31 04:37:23 -04:00
Alex "mcmonkey" Goodwin
d4aeefc297 add github action to automatically handle stale user support issues (#4683)
* add github action to automatically handle stale user support issues

* improve stale message

* remove token part
2024-08-31 01:57:18 -04:00
comfyanonymous
587e7ca654 Remove github buttons. 2024-08-31 01:53:10 -04:00
Chenlei Hu
c90459eba0 Update ComfyUI_frontend to 1.2.40 (#4691)
* Update ComfyUI_frontend to 1.2.40

* Add files
2024-08-30 19:32:10 -04:00
Vedat Baday
04278afb10 feat: return import_failed from init_extra_nodes function (#4694) 2024-08-30 19:26:47 -04:00
comfyanonymous
935ae153e1 Cleanup. 2024-08-30 12:53:59 -04:00
Chenlei Hu
e91662e784 Get logs endpoint & system_stats additions (#4690)
* Add route for getting output logs

* Include ComfyUI version

* Move to own function

* Changed to memory logger

* Unify logger setup logic

* Fix get version git fallback

---------

Co-authored-by: pythongosssss <125205205+pythongosssss@users.noreply.github.com>
2024-08-30 12:46:37 -04:00
comfyanonymous
63fafaef45 Fix potential issue with hydit controlnets. 2024-08-30 04:58:41 -04:00
Alex "mcmonkey" Goodwin
ec28cd9136 swap legacy sdv15 link (#4682)
* swap legacy sdv15 link

* swap v15 ckpt examples to safetensors

* link the fp16 copy of the model by default
2024-08-29 19:48:48 -04:00
comfyanonymous
6eb5d64522 Fix glora lowvram issue. 2024-08-29 19:07:23 -04:00
comfyanonymous
10a79e9898 Implement model part of flux union controlnet. 2024-08-29 18:41:22 -04:00
comfyanonymous
ea3f39bd69 InstantX depth flux controlnet. 2024-08-29 02:14:19 -04:00
comfyanonymous
b33cd61070 InstantX canny controlnet. 2024-08-28 19:02:50 -04:00
Dr.Lt.Data
34eda0f853 fix: remove redundant useless loop (#4656)
fix: potential error of undefined variable

https://github.com/comfyanonymous/ComfyUI/discussions/4650
2024-08-28 17:46:30 -04:00
comfyanonymous
d31e226650 Unify RMSNorm code. 2024-08-28 16:56:38 -04:00
comfyanonymous
b79fd7d92c ComfyUI supports more than just stable diffusion. 2024-08-28 16:12:24 -04:00
comfyanonymous
38c22e631a Fix case where model was not properly unloaded in merging workflows. 2024-08-27 19:03:51 -04:00
Chenlei Hu
6bbdcd28ae Support weight padding on diff weight patch (#4576) 2024-08-27 13:55:37 -04:00
comfyanonymous
ab130001a8 Do RMSNorm in native type. 2024-08-27 02:41:56 -04:00
Chenlei Hu
ca4b8f30e0 Cleanup empty dir if frontend zip download failed (#4574) 2024-08-27 02:07:25 -04:00
Robin Huang
70b84058c1 Add relative file path to the progress report. (#4621) 2024-08-27 02:06:12 -04:00
comfyanonymous
2ca8f6e23d Make the stochastic fp8 rounding reproducible. 2024-08-26 15:12:06 -04:00
comfyanonymous
7985ff88b9 Use less memory in float8 lora patching by doing calculations in fp16. 2024-08-26 14:45:58 -04:00
comfyanonymous
c6812947e9 Fix potential memory leak. 2024-08-26 02:07:32 -04:00
comfyanonymous
9230f65823 Fix some controlnets OOMing when loading. 2024-08-25 05:54:29 -04:00
guill
6ab1e6fd4a [Bug #4529] Fix graph partial validation failure (#4588)
Currently, if a graph partially fails validation (i.e. some outputs are
valid while others have links from missing nodes), the execution loop
could get an exception resulting in server lockup.

This isn't actually possible to reproduce via the default UI, but is a
potential issue for people using the API to construct invalid graphs.
2024-08-24 15:34:58 -04:00
comfyanonymous
07dcbc3a3e Clarify how to use high quality previews. 2024-08-24 02:31:03 -04:00
comfyanonymous
8ae23d8e80 Fix onnx export. 2024-08-23 17:52:47 -04:00
comfyanonymous
7df42b9a23 Fix dora. 2024-08-23 04:58:59 -04:00
comfyanonymous
5d8bbb7281 Cleanup. 2024-08-23 04:06:27 -04:00
comfyanonymous
2c1d2375d6 Fix. 2024-08-23 04:04:55 -04:00
Simon Lui
64ccb3c7e3 Rework IPEX check for future inclusion of XPU into Pytorch upstream and do a bit more optimization of ipex.optimize(). (#4562) 2024-08-23 03:59:57 -04:00
Scorpinaus
9465b23432 Added SD15_Inpaint_Diffusers model support for unet_config_from_diffusers_unet function (#4565) 2024-08-23 03:57:08 -04:00
Chenlei Hu
bb4416dd5b Fix task.status.status_str caused by #2666 (#4551)
* Fix task.status.status_str caused by 2666 regression

* fix

* fix
2024-08-22 17:38:30 -04:00
comfyanonymous
c0b0da264b Missing imports. 2024-08-22 17:20:51 -04:00
comfyanonymous
c26ca27207 Move calculate function to comfy.lora 2024-08-22 17:12:00 -04:00
comfyanonymous
7c6bb84016 Code cleanups. 2024-08-22 17:05:12 -04:00
comfyanonymous
c54d3ed5e6 Fix issue with models staying loaded in memory. 2024-08-22 15:58:20 -04:00
comfyanonymous
c7ee4b37a1 Try to fix some lora issues. 2024-08-22 15:32:18 -04:00
David
7b70b266d8 Generalize MacOS version check for force-upcast-attention (#4548)
This code automatically forces upcasting attention for MacOS versions 14.5 and 14.6. My computer returns the string "14.6.1" for `platform.mac_ver()[0]`, so this generalizes the comparison to catch more versions.

I am running MacOS Sonoma 14.6.1 (latest version) and was seeing black image generation on previously functional workflows after recent software updates. This PR solved the issue for me.

See comfyanonymous/ComfyUI#3521
2024-08-22 13:24:21 -04:00
comfyanonymous
8f60d093ba Fix issue. 2024-08-22 10:38:24 -04:00
guill
dafbe321d2 Fix a bug where cached outputs affected IS_CHANGED (#4535)
This change fixes a bug where non-constant values could be passed to the
IS_CHANGED function. This would result in workflows taking an extra
execution before they acted as if they were cached.

The actual change is like 4 characters -- the rest is adding unit tests.
2024-08-21 23:38:46 -04:00
comfyanonymous
5f84ea63e8 Add a shortcut to the nightly package to run with --fast. 2024-08-21 23:36:58 -04:00
comfyanonymous
843a7ff70c fp16 is actually faster than fp32 on a GTX 1080. 2024-08-21 23:23:50 -04:00
comfyanonymous
a60620dcea Fix slow performance on 10 series Nvidia GPUs. 2024-08-21 16:39:02 -04:00
comfyanonymous
015f73dc49 Try a different type of flux fp16 fix. 2024-08-21 16:17:15 -04:00
comfyanonymous
904bf58e7d Make --fast work on pytorch nightly. 2024-08-21 14:01:41 -04:00
Svein Ove Aas
5f50263088 Replace use of .view with .reshape (#4522)
When generating images with fp8_e4m3 Flux and batch size >1, using --fast, ComfyUI throws a "view size is not compatible with input tensor's size and stride" error pointing at the first of these two calls to view.

As reshape is semantically equivalent to view except for working on a broader set of inputs, there should be no downside to changing this. The only difference is that it clones the underlying data in cases where .view would error out. I have confirmed that the output still looks as expected, but cannot confirm that no mutable use is made of the tensors anywhere.

Note that --fast is only marginally faster than the default.
2024-08-21 11:21:48 -04:00
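A quick illustration of the difference outside ComfyUI: .view fails on a non-contiguous tensor, while .reshape silently copies when it has to.

```python
import torch

x = torch.randn(4, 6)
t = x.transpose(0, 1)        # non-contiguous view over the same storage
try:
    t.view(24)               # raises: view size is not compatible with input tensor's size and stride
except RuntimeError as e:
    print("view failed:", e)

flat = t.reshape(24)         # works: reshape copies the data when a view is impossible
print(flat.shape)            # torch.Size([24])
```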
Alex "mcmonkey" Goodwin
5e806f555d add a get models list api route (#4519)
* get models list api route

* remove copypasta
2024-08-21 02:04:42 -04:00
Robin Huang
f07e5bb522 Add GET /internal/files. (#4295)
* Create internal route table.

* List files.

* Add GET /internal/files.

Retrieves list of files in models, output, and user directories.

* Refactor file names.

* Use typing_extensions for Python 3.8

* Fix tests.

* Remove print statements.

* Update README.

* Add output and user to valid directory test.

* Add missing type hints.
2024-08-21 01:25:06 -04:00
comfyanonymous
03ec517afb Remove useless line, adjust windows default reserved vram. 2024-08-21 00:47:19 -04:00
Chenlei Hu
f257fc999f Add optional deprecated/experimental flag to node class (#4506)
* Add optional deprecated flag to node class

* nit

* Add experimental flag
2024-08-21 00:01:34 -04:00
Chenlei Hu
bb50e69839 Update frontend to 1.2.30 (#4513) 2024-08-21 00:00:49 -04:00
comfyanonymous
510f3438c1 Speed up fp8 matrix mult by using better code. 2024-08-20 22:53:26 -04:00
comfyanonymous
ea63b1c092 Simpletrainer lycoris format. 2024-08-20 12:05:13 -04:00
comfyanonymous
9953f22fce Add --fast argument to enable experimental optimizations.
Optimizations that might break things/lower quality will be put behind
this flag first and might be enabled by default in the future.

Currently the only optimization is float8_e4m3fn matrix multiplication on
4000/ADA series Nvidia cards or later. If you have one of these cards you
will see a speed boost when using fp8_e4m3fn flux for example.
2024-08-20 11:55:51 -04:00
comfyanonymous
d1a6bd6845 Support loading long clipl model with the CLIP loader node. 2024-08-20 10:46:36 -04:00
comfyanonymous
83dbac28eb Properly set if clip text pooled projection instead of using hack. 2024-08-20 10:46:36 -04:00
comfyanonymous
538cb068bc Make cast_to a nop if weight is already good. 2024-08-20 10:46:36 -04:00
comfyanonymous
1b3eee672c Fix potential issue with multi devices. 2024-08-20 10:46:36 -04:00
Chenlei Hu
5a69f84c3c Update README.md (Add shield badges) (#4490) 2024-08-19 18:25:20 -04:00
comfyanonymous
9eee470244 New load_text_encoder_state_dicts function.
Now you can load text encoders straight from a list of state dicts.
2024-08-19 17:36:35 -04:00
comfyanonymous
045377ea89 Add a --reserve-vram argument if you don't want comfy to use all of it.
--reserve-vram 1.0 for example will make ComfyUI try to keep 1GB vram free.

This can also be useful if workflows are failing because of OOM errors but
in that case please report it if --reserve-vram improves your situation.
2024-08-19 17:16:18 -04:00
comfyanonymous
4d341b78e8 Bug fixes. 2024-08-19 16:28:55 -04:00
comfyanonymous
6138f92084 Use better dtype for the lowvram lora system. 2024-08-19 15:35:25 -04:00
comfyanonymous
be0726c1ed Remove duplication. 2024-08-19 15:26:50 -04:00
comfyanonymous
766ae119a8 CheckpointSave node name. 2024-08-19 15:06:12 -04:00
Yoland Yan
fc90ceb6ba Update issue template config.yml to direct frontend issues to frontend repos (#4486)
* Update config.yml

* Typos
2024-08-19 13:41:30 -04:00
comfyanonymous
4506ddc86a Better subnormal fp8 stochastic rounding. Thanks Ashen. 2024-08-19 13:38:03 -04:00
comfyanonymous
20ace7c853 Code cleanup. 2024-08-19 12:48:59 -04:00
Chenlei Hu
b29b3b86c5 Update README to include frontend section (#4468)
* Update README to include frontend section

* nit
2024-08-19 07:12:32 -04:00
comfyanonymous
22ec02afc0 Handle subnormal numbers in float8 rounding. 2024-08-19 05:51:08 -04:00
comfyanonymous
39f114c44b Less broken non blocking? 2024-08-18 16:53:17 -04:00
comfyanonymous
6730f3e1a3 Disable non blocking.
It fixed some perf issues but caused other issues that need to be debugged.
2024-08-18 14:38:09 -04:00
comfyanonymous
73332160c8 Enable non blocking transfers in lowvram mode. 2024-08-18 10:29:33 -04:00
comfyanonymous
2622c55aff Automatically use RF variant of dpmpp_2s_ancestral if RF model. 2024-08-18 00:47:25 -04:00
Ashen
1beb348ee2 dpmpp_2s_ancestral_RF for rectified flow (Flux, SD3 and Auraflow). 2024-08-18 00:33:30 -04:00
bymyself
9aa39e743c Add new shortcuts to readme (#4442) 2024-08-17 23:52:56 -04:00
comfyanonymous
d31df04c8a Indentation. 2024-08-17 23:00:44 -04:00
Xrvk
e68763f40c Add Flux model support for InstantX style controlnet residuals (#4444)
* Add Flux model support for InstantX style controlnet residuals

* Refactor Flux controlnet residual step to a separate method

* Rollback minor change

* New format for applying controlnet residuals: input->double_blocks, output->single_blocks

* Adjust XLabs Flux controlnet to fit new syntax of applying Flux controlnet residuals

* Remove unnecessary import and minor style change
2024-08-17 22:58:23 -04:00
comfyanonymous
310ad09258 Add a ModelSave node. 2024-08-17 21:43:07 -04:00
comfyanonymous
4f7a3cb6fb unet -> diffusion_models. 2024-08-17 21:31:04 -04:00
comfyanonymous
bb222ceddb Fix loras having a weak effect when applied on fp8. 2024-08-17 15:20:17 -04:00
comfyanonymous
14af129c55 Improve execution UX.
Some branches with VAELoader -> VAEDecode -> Preview were being executed
last. With this change they will be executed earlier.
2024-08-17 11:37:21 -04:00
comfyanonymous
fca42836f2 Add model_options for text encoder. 2024-08-17 11:17:20 -04:00
comfyanonymous
858d51f91a Fix VAEDecode -> Preview not being executed first. 2024-08-17 04:08:54 -04:00
comfyanonymous
cd5017c1c9 calculate_weight function to use a different dtype. 2024-08-17 01:06:08 -04:00
comfyanonymous
83f343146a Fix potential lowvram issue. 2024-08-16 17:12:42 -04:00
Chenlei Hu
b021cf67c7 Update frontend to 1.2.26 (#4415) 2024-08-16 15:25:02 -04:00
Matthew Turnshek
1770fc77ed Implement support for taef1 latent previews (#4409)
* add taef1 handling to several places

* remove guess_latent_channels and add latent_channels info directly to flux model

* remove TODO

* fix numbers
2024-08-16 12:53:13 -04:00
comfyanonymous
05a9f3faa1 Log a warning when there's an issue with IS_CHANGED. 2024-08-16 08:50:17 -04:00
comfyanonymous
86c5970ac0 Fix custom nodes hooking the map_node_over_list and breaking things. 2024-08-16 08:40:31 -04:00
Chenlei Hu
bfc214f434 Use new TS frontend uncompressed (#4379)
* Swap frontend uncompressed

* Add uncompressed files
2024-08-15 16:50:25 -04:00
comfyanonymous
3f5939add6 Tell github not to count the web directory in language stats. 2024-08-15 13:48:56 -04:00
comfyanonymous
5960f946a9 Move a few files from comfy -> comfy_execution.
Python code in the comfy folder should not import things from outside it.
2024-08-15 11:21:14 -04:00
guill
5cfe38f41c Execution Model Inversion (#2666)
* Execution Model Inversion

This PR inverts the execution model -- from recursively calling nodes to
using a topological sort of the nodes. This change allows for
modification of the node graph during execution. This allows for two
major advantages:

    1. The implementation of lazy evaluation in nodes. For example, if a
    "Mix Images" node has a mix factor of exactly 0.0, the second image
    input doesn't even need to be evaluated (and vice versa if the mix
    factor is 1.0).

    2. Dynamic expansion of nodes. This allows for the creation of dynamic
    "node groups". Specifically, custom nodes can return subgraphs that
    replace the original node in the graph. This is an incredibly
    powerful concept. Using this functionality, it was easy to
    implement:
        a. Components (a.k.a. node groups)
        b. Flow control (i.e. while loops) via tail recursion
        c. All-in-one nodes that replicate the WebUI functionality
        d. and more
    All of those were able to be implemented entirely via custom nodes,
    so those features are *not* a part of this PR. (There are some
    front-end changes that should occur before that functionality is
    made widely available, particularly around variant sockets.)

The custom nodes associated with this PR can be found at:
https://github.com/BadCafeCode/execution-inversion-demo-comfyui

Note that some of them require that variant socket types ("*") be
enabled.

* Allow `input_info` to be of type `None`

* Handle errors (like OOM) more gracefully

* Add a command-line argument to enable variants

This allows the use of nodes that have sockets of type '*' without
applying a patch to the code.

* Fix an overly aggressive assertion.

This could happen when attempting to evaluate `IS_CHANGED` for a node
during the creation of the cache (in order to create the cache key).

* Fix Pyright warnings

* Add execution model unit tests

* Fix issue with unused literals

Behavior should now match the master branch with regard to undeclared
inputs. Undeclared inputs that are socket connections will be used while
undeclared inputs that are literals will be ignored.

* Make custom VALIDATE_INPUTS skip normal validation

Additionally, if `VALIDATE_INPUTS` takes an argument named `input_types`,
that variable will be a dictionary of the socket type of all incoming
connections. If that argument exists, normal socket type validation will
not occur. This removes the last hurdle for enabling variant types
entirely from custom nodes, so I've removed that command-line option.

I've added appropriate unit tests for these changes.

* Fix example in unit test

This wouldn't have caused any issues in the unit test, but it would have
bugged the UI if someone copy+pasted it into their own node pack.

* Use fstrings instead of '%' formatting syntax

* Use custom exception types.

* Display an error for dependency cycles

Previously, dependency cycles that were created during node expansion
would cause the application to quit (due to an uncaught exception). Now,
we'll throw a proper error to the UI. We also make an attempt to 'blame'
the most relevant node in the UI.

* Add docs on when ExecutionBlocker should be used

* Remove unused functionality

* Rename ExecutionResult.SLEEPING to PENDING

* Remove superfluous function parameter

* Pass None for uneval inputs instead of default

This applies to `VALIDATE_INPUTS`, `check_lazy_status`, and lazy values
in evaluation functions.

* Add a test for mixed node expansion

This test ensures that a node that returns a combination of expanded
subgraphs and literal values functions correctly.

* Raise exception for bad get_node calls.

* Minor refactor of IsChangedCache.get

* Refactor `map_node_over_list` function

* Fix ui output for duplicated nodes

* Add documentation on `check_lazy_status`

* Add file for execution model unit tests

* Clean up Javascript code as per review

* Improve documentation

Converted some comments to docstrings as per review

* Add a new unit test for mixed lazy results

This test validates that when an output list is fed to a lazy node, the
node will properly evaluate previous nodes that are needed by any inputs
to the lazy node.

No code in the execution model has been changed. The test already
passes.

* Allow kwargs in VALIDATE_INPUTS functions

When kwargs are used, validation is skipped for all inputs as if they
had been mentioned explicitly.

* List cached nodes in `execution_cached` message

This was previously just bugged in this PR.
2024-08-15 11:21:11 -04:00
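A rough sketch of the lazy-evaluation pattern this PR describes, following the "Mix Images" example: inputs declared with a lazy flag and a check_lazy_status hook that names the inputs still needed (unevaluated lazy inputs arrive as None, as noted above). Treat it as an illustration, not the exact API shipped here.

```python
class MixImages:
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {
            "image1": ("IMAGE", {"lazy": True}),
            "image2": ("IMAGE", {"lazy": True}),
            "mix_factor": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0}),
        }}

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "mix"
    CATEGORY = "examples/lazy"

    def check_lazy_status(self, image1, image2, mix_factor):
        # Ask the executor only for the branches the blend actually needs.
        needed = []
        if mix_factor > 0.0 and image1 is None:
            needed.append("image1")
        if mix_factor < 1.0 and image2 is None:
            needed.append("image2")
        return needed

    def mix(self, image1, image2, mix_factor):
        if mix_factor <= 0.0:
            return (image2,)
        if mix_factor >= 1.0:
            return (image1,)
        return (image1 * mix_factor + image2 * (1.0 - mix_factor),)
```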
comfyanonymous
0f9c2a7822 Try to fix SDXL OOM issue on some configurations. 2024-08-14 23:08:54 -04:00
comfyanonymous
153d0a8142 Add a update/update_comfyui_stable.bat to the standalones. 2024-08-14 22:29:23 -04:00
Chenlei Hu
ab4dd19b91 Remove legacy ui test files (#4316) 2024-08-14 21:01:06 -04:00
comfyanonymous
f1d6cef71c Revert "Disable cuda malloc by default."
This reverts commit 50bf66e5c4.
2024-08-14 08:38:07 -04:00
comfyanonymous
33fb282d5c Fix issue. 2024-08-14 02:51:47 -04:00
comfyanonymous
50bf66e5c4 Disable cuda malloc by default. 2024-08-14 02:49:25 -04:00
pythongosssss
e60e19b175 Add support for simple tooltips (#3842)
* Add support for simple tooltips

* Fix overflow

* Add tooltips for nodes in the default workflow

* new line

* Prevent potential crash

* PR feedback

* Hide tooltip when clicking (e.g. combo widget)

* Refactor tooltips, add node level support

* Fix

* move

* Fix test (and undo last change)

* Fixed indent

* Fix dom widgets, dont show tooltip if not over canvas
2024-08-14 01:22:10 -04:00
comfyanonymous
a5af64d3ce Revert "Not sure if this actually changes anything but it can't hurt."
This reverts commit 34608de2e9.
2024-08-14 01:05:17 -04:00
Robin Huang
3e52e0364c Add model downloading endpoint. (#4248)
* Add model downloading endpoint.

* Move client session init to async function.

* Break up large function.

* Send "download_progress" as websocket event.

* Fixed

* Fixed.

* Use async mock.

* Move server set up to right before run call.

* Validate that model subdirectory cannot contain relative paths.

* Add download_model test checking for invalid paths.

* Remove DS_Store.

* Consolidate DownloadStatus and DownloadModelResult

* Add progress_interval as an optional parameter.

* Use tuple type from annotations.

* Use pydantic.

* Update comment.

* Revert "Use pydantic."

This reverts commit 7461e8eb00.

* Add new line.

* Add newline EOF.

* Validate model filename as well.

* Add comment to not rely on internal.

* Restrict downloading to safetensor files only.
2024-08-13 15:48:52 -04:00
comfyanonymous
34608de2e9 Not sure if this actually changes anything but it can't hurt. 2024-08-13 13:29:16 -04:00
comfyanonymous
39fb74c5bd Fix bug when model cannot be partially unloaded. 2024-08-13 03:57:55 -04:00
comfyanonymous
74e124f4d7 Fix some issues with TE being in lowvram mode. 2024-08-12 23:42:21 -04:00
comfyanonymous
a562c17e8a load_unet -> load_diffusion_model with a model_options argument. 2024-08-12 23:20:57 -04:00
comfyanonymous
5942c17d55 Order of operations matters. 2024-08-12 21:56:18 -04:00
comfyanonymous
c032b11e07 xlabs Flux controlnet implementation. (#4260)
* xlabs Flux controlnet.

* Fix not working on old python.

* Remove comment.
2024-08-12 21:22:22 -04:00
comfyanonymous
b8ffb2937f Memory tweaks. 2024-08-12 15:07:11 -04:00
Vladimir Semyonov
ce37c11164 add DS_Store to gitignore (#4324) 2024-08-12 12:32:34 -04:00
Alex "mcmonkey" Goodwin
b5c3906b38 Automatically link the Comfy CI page on PRs (#4326)
also use_prior_commit so it doesn't get a janked merge commit instead of the real one
2024-08-12 12:32:16 -04:00
comfyanonymous
5d43e75e5b Fix some issues with the model sometimes not getting patched. 2024-08-12 12:27:54 -04:00
comfyanonymous
517f4a94e4 Fix some lora loading slowdowns. 2024-08-12 11:50:32 -04:00
comfyanonymous
52a471c5c7 Change name of log. 2024-08-12 10:35:06 -04:00
comfyanonymous
ad76574cb8 Fix some potential issues with the previous commits. 2024-08-12 00:23:29 -04:00
comfyanonymous
9acfe4df41 Support loading directly to vram with CLIPLoader node. 2024-08-12 00:06:01 -04:00
comfyanonymous
9829b013ea Fix mistake in last commit. 2024-08-12 00:00:17 -04:00
comfyanonymous
5c69cde037 Load TE model straight to vram if certain conditions are met. 2024-08-11 23:52:43 -04:00
comfyanonymous
e9589d6d92 Add a way to set model dtype and ops from load_checkpoint_guess_config. 2024-08-11 08:50:34 -04:00
comfyanonymous
0d82a798a5 Remove the ckpt_path from load_state_dict_guess_config. 2024-08-11 08:37:35 -04:00
ljleb
925fff26fd alternative to load_checkpoint_guess_config that accepts a loaded state dict (#4249)
* make alternative fn

* add back ckpt path as 2nd argument?
2024-08-11 08:36:52 -04:00
comfyanonymous
75b9b55b22 Fix issues with #4302 and support loading diffusers format flux. 2024-08-10 21:28:24 -04:00
Jaret Burkett
1765f1c60c FLUX: Added full diffusers mapping for FLUX.1 schnell and dev. Adds full LoRA support from diffusers LoRAs. (#4302) 2024-08-10 21:26:41 -04:00
comfyanonymous
1de69fe4d5 Fix some issues with inference slowing down. 2024-08-10 16:21:25 -04:00
comfyanonymous
ae197f651b Speed up hunyuan dit inference a bit. 2024-08-10 07:36:27 -04:00
comfyanonymous
1b5b8ca81a Fix regression. 2024-08-09 21:45:21 -04:00
comfyanonymous
6678d5cf65 Fix regression. 2024-08-09 14:02:38 -04:00
TTPlanetPig
e172564eea Update controlnet.py to fix the default controlnet weight as constant (#4285) 2024-08-09 13:40:05 -04:00
comfyanonymous
a3cc326748 Better fix for lowvram issue. 2024-08-09 12:16:25 -04:00
comfyanonymous
86a97e91fc Fix controlnet regression. 2024-08-09 12:08:58 -04:00
comfyanonymous
5acdadc9f3 Fix issue with some lowvram weights. 2024-08-09 03:58:28 -04:00
comfyanonymous
55ad9d5f8c Fix regression. 2024-08-09 03:36:40 -04:00
comfyanonymous
a9f04edc58 Implement text encoder part of HunyuanDiT loras. 2024-08-09 03:21:10 -04:00
comfyanonymous
a475ec2300 Cleanup HunyuanDit controlnets.
Use the ControlNetApply SD3 and HunyuanDiT node.
2024-08-09 02:59:34 -04:00
来新璐
06eb9fb426 feat: add support for HunYuanDit ControlNet (#4245)
* add support for HunYuanDit ControlNet

* fix hunyuandit controlnet

* fix typo in hunyuandit controlnet

* fix typo in hunyuandit controlnet

* fix code format style

* add control_weight support for HunyuanDit Controlnet

* use control_weights in HunyuanDit Controlnet

* fix typo
2024-08-09 02:59:24 -04:00
comfyanonymous
413322645e Raw torch is faster than einops? 2024-08-08 22:09:29 -04:00
comfyanonymous
11200de970 Cleaner code. 2024-08-08 20:07:09 -04:00
comfyanonymous
037c38eb0f Try to improve inference speed on some machines. 2024-08-08 17:29:27 -04:00
comfyanonymous
1e11d2d1f5 Better prints. 2024-08-08 17:29:27 -04:00
Alex "mcmonkey" Goodwin
65ea6be38f PullRequest CI Run: use pull_request_target to allow the CI Dashboard to work (#4277)
'_target' allows secrets to pass through, and we're just using the secret that allows uploading to the dashboard and are manually vetting PRs before running this workflow anyway
2024-08-08 17:20:48 -04:00
Alex "mcmonkey" Goodwin
5df6f57b5d minor fix on copypasta action name (#4276)
my bad sorry
2024-08-08 16:30:59 -04:00
Alex "mcmonkey" Goodwin
6588bfdef9 add GitHub workflow for CI tests of PRs (#4275)
When the 'Run-CI-Test' label is added to a PR, it will be tested by the CI, on a small matrix of stable versions.
2024-08-08 16:24:49 -04:00
Alex "mcmonkey" Goodwin
50ed2879ef Add full CI test matrix GitHub Workflow (#4274)
automatically runs a matrix of full GPU-enabled tests on all new commits to the ComfyUI master branch
2024-08-08 15:40:07 -04:00
comfyanonymous
66d4233210 Fix. 2024-08-08 15:16:51 -04:00
comfyanonymous
591010b7ef Support diffusers text attention flux loras. 2024-08-08 14:45:52 -04:00
comfyanonymous
08f92d55e9 Partial model shift support. 2024-08-08 14:45:06 -04:00
comfyanonymous
8115d8cce9 Add Flux fp16 support hack. 2024-08-07 15:08:39 -04:00
comfyanonymous
6969fc9ba4 Make supported_dtypes a priority list. 2024-08-07 15:00:06 -04:00
comfyanonymous
cb7c4b4be3 Workaround for lora OOM on lowvram mode. 2024-08-07 14:30:54 -04:00
comfyanonymous
1208863eca Fix "Comfy" lora keys.
They are in this format now:
diffusion_model.full.model.key.name.lora_up.weight
2024-08-07 13:49:31 -04:00
comfyanonymous
e1c528196e Fix bundled embed. 2024-08-07 13:30:45 -04:00
comfyanonymous
17030fd4c0 Support for "Comfy" lora format.
The keys are just: model.full.model.key.name.lora_up.weight

It is supported by all models that ComfyUI supports.

Now people can just convert loras to this format instead of having to ask
for me to implement them.
2024-08-07 13:18:32 -04:00
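A minimal sketch of writing a lora in the key layout named above; the input dictionary shape is hypothetical, only the key naming follows the commit message.

```python
from safetensors.torch import save_file

def save_comfy_lora(pairs, out_path):
    """pairs: {full_model_weight_name: (lora_up_tensor, lora_down_tensor)} - hypothetical layout."""
    sd = {}
    for name, (up, down) in pairs.items():
        sd[f"model.{name}.lora_up.weight"] = up.contiguous().cpu()
        sd[f"model.{name}.lora_down.weight"] = down.contiguous().cpu()
    save_file(sd, out_path)
```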
comfyanonymous
c19dcd362f Controlnet code refactor. 2024-08-07 12:59:28 -04:00
comfyanonymous
1c08bf35b4 Support format for embeddings bundled in loras. 2024-08-07 03:45:25 -04:00
PhilWun
2a02546e20 Add type hints to folder_paths.py (#4191)
* add type hints to folder_paths.py

* replace deprecated standard collections type hints

* fix type error when using Python 3.8
2024-08-06 21:59:34 -04:00
comfyanonymous
b334605a66 Fix OOMs happening in some cases.
A cloned model patcher sometimes reported a model was loaded on a device
when it wasn't.
2024-08-06 13:36:04 -04:00
comfyanonymous
de17a9755e Unload all models if there's an OOM error. 2024-08-06 03:30:28 -04:00
comfyanonymous
c14ac98fed Unload models and load them back in lowvram mode when there is no free vram. 2024-08-06 03:22:39 -04:00
Robin Huang
2894511893 Clone taesd with depth of 1 to reduce download size. (#4232) 2024-08-06 01:46:09 -04:00
Silver
f3bc40223a Add format metadata to CLIP save to make compatible with diffusers safetensors loading (#4233) 2024-08-06 01:45:24 -04:00
Chenlei Hu
841e74ac40 Change browser test CI python to 3.8 (#4234) 2024-08-06 01:27:28 -04:00
comfyanonymous
2d75df45e6 Flux tweak memory usage. 2024-08-05 21:58:28 -04:00
Robin Huang
1abc9c8703 Stable release uses cached dependencies (#4231)
* Release stable based on existing tag.

* Update default cuda to 12.1.
2024-08-05 20:07:16 -04:00
comfyanonymous
8edbcf5209 Improve performance on some lowend GPUs. 2024-08-05 16:24:04 -04:00
comfyanonymous
e545a636ba This probably doesn't work anymore. 2024-08-05 12:31:42 -04:00
bymyself
33e5203a2a Don't cache index.html (#4211) 2024-08-05 12:25:28 -04:00
a-One-Fan
a178e25912 Fix Flux FP64 math on XPU (#4210) 2024-08-05 01:26:20 -04:00
comfyanonymous
78e133d041 Support simple diffusers Flux loras. 2024-08-04 22:05:48 -04:00
Silver
7afa985fba Correct spelling 'token_weight_pars_t5' to 'token_weight_pairs_t5' (#4200) 2024-08-04 17:10:02 -04:00
comfyanonymous
ddb6a9f47c Set the step in EmptySD3LatentImage to 16.
These models work better when the res is a multiple of 16.
2024-08-04 15:59:02 -04:00
comfyanonymous
3b71f84b50 ONNX tracing fixes. 2024-08-04 15:45:43 -04:00
comfyanonymous
0a6b008117 Fix issue with some custom nodes. 2024-08-04 10:03:33 -04:00
comfyanonymous
56f3c660bf ModelSamplingFlux now takes a resolution and adjusts the shift with it.
If you want to sample Flux dev exactly how the reference code does, use
the same resolution as your image in this node.
2024-08-04 04:06:00 -04:00
comfyanonymous
f7a5107784 Fix crash. 2024-08-03 16:55:38 -04:00
comfyanonymous
91be9c2867 Tweak lowvram memory formula. 2024-08-03 16:44:50 -04:00
comfyanonymous
03c5018c98 Lower lowvram memory to 1/3 of free memory. 2024-08-03 15:14:07 -04:00
comfyanonymous
2ba5cc8b86 Fix some issues. 2024-08-03 15:06:40 -04:00
comfyanonymous
1e68002b87 Cap lowvram to half of free memory. 2024-08-03 14:50:20 -04:00
comfyanonymous
ba9095e5bd Automatically use fp8 for diffusion model weights if:
Checkpoint contains weights in fp8.

There isn't enough memory to load the diffusion model in GPU vram.
2024-08-03 13:45:19 -04:00
comfyanonymous
f123328b82 Load T5 in fp8 if it's in fp8 in the Flux checkpoint. 2024-08-03 12:39:33 -04:00
comfyanonymous
63a7e8edba More aggressive batch splitting. 2024-08-03 11:53:30 -04:00
comfyanonymous
0eea47d580 Add ModelSamplingFlux to experiment with the shift value.
Default shift on Flux Schnell is 0.0
2024-08-03 03:54:38 -04:00
comfyanonymous
7cd0cdfce6 Add advanced model merge node for Flux model. 2024-08-02 23:20:53 -04:00
comfyanonymous
ea03c9dcd2 Better per model memory usage estimations. 2024-08-02 18:09:24 -04:00
comfyanonymous
3a9ee995cf Tweak regular SD memory formula. 2024-08-02 17:34:30 -04:00
comfyanonymous
47da42d928 Better Flux vram estimation. 2024-08-02 17:02:35 -04:00
comfyanonymous
17bbd83176 Fix bug loading flac workflow when it contains = character. 2024-08-02 13:14:28 -04:00
fgdfgfthgr-fox
bfb52de866 Lower SAG scale step for finer control (#4158)
* Lower SAG step for finer control

Since the introduction of cfg++, which uses very low cfg values, a step of 0.1 in SAG might be too high for finer control. Even SAG of 0.1 can be too high when cfg is only 0.6, so I changed the step to 0.01.

* Lower PAG step as well.

* Update nodes_sag.py
2024-08-02 10:29:03 -04:00
comfyanonymous
eca962c6da Add FluxGuidance node.
This lets you adjust the guidance on the dev model, which is a parameter
that is passed to the diffusion model.
2024-08-02 10:25:49 -04:00
Jairo Correa
c1696cd1b5 Add missing import (#4174) 2024-08-02 09:34:12 -04:00
comfyanonymous
369f459b20 Fix no longer working on old pytorch. 2024-08-01 22:20:24 -04:00
Alexander Brown
ce9ac2fe05 Fix clip_g/clip_l mixup (#4168) 2024-08-01 21:40:56 -04:00
comfyanonymous
e638f2858a Hack to make all resolutions work on Flux models. 2024-08-01 21:39:18 -04:00
comfyanonymous
a531001cc7 Add CLIPTextEncodeFlux. 2024-08-01 18:53:25 -04:00
comfyanonymous
d420bc792a Tweak the memory usage formulas for Flux and SD. 2024-08-01 17:53:45 -04:00
comfyanonymous
d965474aaa Make ComfyUI split batches a higher priority than weight offload. 2024-08-01 16:39:59 -04:00
comfyanonymous
1c61361fd2 Fast preview support for Flux. 2024-08-01 16:28:11 -04:00
comfyanonymous
a6decf1e62 Fix bfloat16 potentially not being enabled on mps. 2024-08-01 16:18:44 -04:00
comfyanonymous
48eb1399c0 Try to fix mac issue. 2024-08-01 13:41:27 -04:00
comfyanonymous
b4f6ebb2e8 Rename UNETLoader node to "Load Diffusion Model". 2024-08-01 13:33:30 -04:00
comfyanonymous
d7430a1651 Add a way to load the diffusion model in fp8 with UNETLoader node. 2024-08-01 13:30:51 -04:00
comfyanonymous
f2b80f95d2 Better Mac support on flux model. 2024-08-01 13:10:50 -04:00
comfyanonymous
1aa9cf3292 Make lowvram more aggressive on low memory machines. 2024-08-01 12:11:57 -04:00
comfyanonymous
2f88d19ef3 Add link to Flux examples to readme. 2024-08-01 11:48:19 -04:00
comfyanonymous
eb96c3bd82 Fix .sft file loading (they are safetensors files). 2024-08-01 11:32:58 -04:00
comfyanonymous
5f98de7697 Load flux t5 in fp8 if weights are in fp8. 2024-08-01 11:05:56 -04:00
comfyanonymous
8d34211a7a Fix old python versions no longer working. 2024-08-01 09:57:20 -04:00
comfyanonymous
1589b58d3e Basic Flux Schnell and Flux Dev model implementation. 2024-08-01 09:49:29 -04:00
comfyanonymous
7ad574bffd Mac supports bf16 just make sure you are using the latest pytorch. 2024-08-01 09:42:17 -04:00
comfyanonymous
e2382b6adb Make lowvram less aggressive when there are large amounts of free memory. 2024-08-01 03:58:58 -04:00
comfyanonymous
c24f897352 Fix to get fp8 working on T5 base. 2024-07-31 02:00:19 -04:00
comfyanonymous
a5991a7aa6 Fix hunyuan dit text encoder weights always being in fp32. 2024-07-31 01:34:57 -04:00
comfyanonymous
2c038ccef0 Lower CLIP memory usage by a bit. 2024-07-31 01:32:35 -04:00
comfyanonymous
b85216a3c0 Lower T5 memory usage by a few hundred MB. 2024-07-31 00:52:34 -04:00
comfyanonymous
82cae45d44 Fix potential issue with non clip text embeddings. 2024-07-30 14:41:13 -04:00
comfyanonymous
25853d0be8 Use common function for casting weights to input. 2024-07-30 10:49:14 -04:00
comfyanonymous
79040635da Remove unnecessary code. 2024-07-30 05:01:34 -04:00
comfyanonymous
66d35c07ce Improve artifacts on hydit, auraflow and SD3 on specific resolutions.
This breaks seeds for resolutions that are not a multiple of 16 in pixel
resolution by using circular padding instead of reflection padding, but it
should lower the amount of artifacts when doing img2img at those
resolutions.
2024-07-29 20:48:50 -04:00
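For reference, the two padding modes mentioned, shown with plain torch.nn.functional.pad rather than the actual VAE code:

```python
import torch
import torch.nn.functional as F

x = torch.arange(16.0).reshape(1, 1, 4, 4)
circular = F.pad(x, (1, 1, 1, 1), mode="circular")  # wraps rows/columns around the edges
reflect = F.pad(x, (1, 1, 1, 1), mode="reflect")    # mirrors rows/columns at the edges
print(circular.shape, reflect.shape)                # both: torch.Size([1, 1, 6, 6])
```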
comfyanonymous
c75b50607b Less confusing exception if pillow() function fails. 2024-07-29 11:15:37 -04:00
comfyanonymous
4ba7fa0244 Refactor: Move sd2_clip.py to text_encoders folder. 2024-07-28 01:19:20 -04:00
bymyself
ab76abc767 Active workflow use primary fg color (#4090) 2024-07-27 23:34:19 -04:00
Silver
9300058026 Add dpmpp_2s_ancestral as custom sampler (#4101)
Adding dpmpp_2s_ancestral as custom sampler node to enable its use with eta and s_noise when using custom sampling.
2024-07-27 16:19:50 -04:00
comfyanonymous
f82d09c9b4 Update packaging workflow. 2024-07-27 04:48:19 -04:00
comfyanonymous
e6829e7ac5 Add a way to set custom dependencies in the release workflow. 2024-07-27 04:41:46 -04:00
comfyanonymous
07f6a1a685 Handle case in the updater when master branch is not in local repo. 2024-07-27 03:15:22 -04:00
comfyanonymous
e746965c50 Update nightly package workflow. 2024-07-27 01:20:18 -04:00
comfyanonymous
45a2842d7f Set stable releases as a prerelease initially.
This should give time to test the standalone package before making it live.
2024-07-26 14:52:20 -04:00
Robin Huang
17b41f622e Change windows standalone URL to stable release. (#4065) 2024-07-26 14:37:40 -04:00
comfyanonymous
cf4418b806 Don't treat Bert model like CLIP.
Bert can accept up to 512 tokens, so any prompt with more than 77 should
just be passed to it as is instead of being split up like it is for CLIP.
2024-07-26 13:08:12 -04:00
comfyanonymous
6225a7827c Add CLIPTextEncodeHunyuanDiT.
Useful for testing what each text encoder does.
2024-07-26 13:08:06 -04:00
filtered
b6779d8df3 Fix undo incorrectly undoing text input (#4114)
Fixes an issue where, under certain conditions, the ComfyUI custom undo / redo functions would not run when intended to.

When trying to undo an action like deleting several nodes, the native browser undo runs instead - e.g. a textarea gets focus and the last typed text is undone. Clicking outside the text area and typing again just keeps doing the same thing.
2024-07-26 12:25:42 -04:00
comfyanonymous
8328a2d8cd Let hunyuan dit work with all prompt lengths. 2024-07-26 12:11:32 -04:00
comfyanonymous
afe732bef9 Hunyuan dit can now accept longer prompts. 2024-07-26 11:52:58 -04:00
comfyanonymous
a9ac56fc0d Own BertModel implementation that works with lowvram. 2024-07-26 04:47:17 -04:00
comfyanonymous
25b51b1a8b Hunyuan DiT lora support. 2024-07-25 22:42:54 -04:00
comfyanonymous
61a2b00bc2 Add HunyuanDiT support to readme. 2024-07-25 19:06:43 -04:00
comfyanonymous
a5f4292f9f Basic hunyuan dit implementation. (#4102)
* Let tokenizers return weights to be stored in the saved checkpoint.

* Basic hunyuan dit implementation.

* Fix some resolutions not working.

* Support hydit checkpoint save.

* Init with right dtype.

* Switch to optimized attention in pooler.

* Fix black images on hunyuan dit.
2024-07-25 18:21:08 -04:00
comfyanonymous
f87810cd3e Let tokenizers return weights to be stored in the saved checkpoint. 2024-07-25 10:52:09 -04:00
comfyanonymous
10c919f4c7 Make it possible to load tokenizer data from checkpoints. 2024-07-24 16:43:53 -04:00
comfyanonymous
ce80e69fb8 Avoid loading the dll when it's not necessary. 2024-07-24 13:50:34 -04:00
comfyanonymous
19944ad252 Add code to fix issues with new pytorch version on the standalone. 2024-07-24 12:49:29 -04:00
comfyanonymous
10b43ceea5 Remove duplicate code. 2024-07-24 01:12:59 -04:00
comfyanonymous
0a4c49c57c Support MT5. 2024-07-23 15:35:28 -04:00
comfyanonymous
88ed893034 Allow SPieceTokenizer to load model from a byte string. 2024-07-23 14:17:42 -04:00
comfyanonymous
334ba48cea More generic unet prefix detection code. 2024-07-23 14:13:32 -04:00
comfyanonymous
14764aa2e2 Rename LLAMATokenizer to SPieceTokenizer. 2024-07-22 12:21:45 -04:00
comfyanonymous
b2c995f623 "auto" type is only relevant to the SetUnionControlNetType node. 2024-07-22 11:30:38 -04:00
Chenlei Hu
4151fbfa8a Add error message on union controlnet (#4081) 2024-07-22 11:27:32 -04:00
Chenlei Hu
6045ed31f8 Supress frontend exception on unhandled message type (#4078)
* Supress frontend exception on unhandled message type

* nit
2024-07-21 21:15:01 -04:00
comfyanonymous
f836e69346 Fix bug with SaveAudio node with --gpu-only 2024-07-21 16:16:45 -04:00
Chenlei Hu
5b69cfe7c3 Add timestamp to execution messages (#4076)
* Add timestamp to execution messages

* Add execution_end message

* Rename to execution_success
2024-07-21 15:29:10 -04:00
comfyanonymous
95fa9545f1 Only append zero to noise schedule if last sigma isn't zero. 2024-07-20 12:37:30 -04:00
Greg Wainer
11b74147ee Fix/webp exif little endian (#4061)
* Fix for isLittleEndian flag in parseExifData.

* Add break after reading first exif chunk in getWebpMetadata.
2024-07-19 18:39:04 -04:00
comfyanonymous
6ab8cad22e Implement beta sampling scheduler.
It is based on: https://arxiv.org/abs/2407.12173

Add "beta" to the list of schedulers and the BetaSamplingScheduler node.
2024-07-19 18:05:09 -04:00
bymyself
011b11d8d7 LoadAudio restores file value from workflow (#4043)
* LoadAudio restores file value from workflow

* use onAfterGraphConfigured

* Don't use anonnymous function
2024-07-18 21:59:18 -04:00
comfyanonymous
ff6ca2a892 Move PAG to model_patches/unet section.
Move other unet model_patches nodes to model_patches/unet section.
2024-07-18 17:22:51 -04:00
bymyself
374e093e09 Disable audio widget trying to get previews (#4044) 2024-07-17 16:11:10 -04:00
喵哩个咪
855789403b support clip-vit-large-patch14-336 (#4042)
* support clip-vit-large-patch14-336

* support clip-vit-large-patch14-336
2024-07-17 13:12:50 -04:00
comfyanonymous
6f7869f365 Get clip vision image size from config. 2024-07-17 13:05:38 -04:00
comfyanonymous
281ad42df4 Fix lowvram union controlnet bug. 2024-07-17 10:16:31 -04:00
Chenlei Hu
1cde6b2eff Disallow use of eval with pylint (#4033) 2024-07-16 21:15:08 -04:00
Thomas Ward
c5a48b15bd Make default hash lib configurable without code changes via CLI argument (#3947)
* cli_args: Add --duplicate-check-hash-function.

* server.py: compare_image_hash configurable hash function

Uses an argument added in cli_args to specify the type of hashing to default to for duplicate hash checking.  Uses an `eval()` to identify the specific hashlib class to utilize, but ultimately safely operates because we have specific options and only those options/choices in the arg parser.  So we don't have any unsafe input there.

* Add hasher() to node_helpers

* hashlib selection moved to node_helpers

* default-hashing-function instead of dupe checking hasher

This makes a default-hashing-function option instead of previous selected option.

* Use args.default_hashing_function

* Use safer handling for node_helpers.hasher()

Uses a safer handling method than `eval` to evaluate default hashing function.

* Stray parentheses are evil.

* Indentation fix.

Somehow when I hit save I didn't notice I missed a space to make indentation work proper.  Oops!
2024-07-16 18:27:09 -04:00
Chenlei Hu
f2298799ba Fix annotation (#4035) 2024-07-16 18:20:39 -04:00
comfyanonymous
60383f3b64 Move controlnet nodes to conditioning/controlnet. 2024-07-16 17:08:25 -04:00
comfyanonymous
8270c62530 Add SetUnionControlNetType to set the type of the union controlnet model. 2024-07-16 17:04:53 -04:00
comfyanonymous
821f93872e Allow model sampling to set number of timesteps. 2024-07-16 15:18:40 -04:00
comfyanonymous
e1630391d6 Allow version names like v0.0.1 for the FrontendManager. 2024-07-16 11:29:38 -04:00
Chenlei Hu
99458e8aca Add FrontendManager to manage non-default front-end impl (#3897)
* Add frontend manager

* Add tests

* nit

* Add unit test to github CI

* Fix path

* nit

* ignore

* Add logging

* Install test deps

* Remove 'stable' keyword support

* Update test

* Add web-root arg

* Rename web-root to front-end-root

* Add test on non-exist version number

* Use repo owner/name to replace hard coded provider list

* Inline cmd args

* nit

* Fix unit test
2024-07-16 11:26:11 -04:00
comfyanonymous
33346fd9b8 Fix bug with custom nodes on other drives. 2024-07-15 20:38:26 -04:00
comfyanonymous
136c93cb47 Fix bug with workflow not registering change.
There was an issue when only the class type of a node changed with all the
inputs staying the same.
2024-07-15 20:01:49 -04:00
comfyanonymous
1305fb294c Refactor: Move some code to the comfy/text_encoders folder. 2024-07-15 17:36:24 -04:00
comfyanonymous
7914c47d5a Quick fix for the promax controlnet. 2024-07-14 10:07:36 -04:00
pythongosssss
79547efb65 New menu fixes - fix send to workflow (#3909)
* Fix send to workflow
Fix center align of close workflow dialog
Better support for elements around canvas

* More resilent to extra elements added to body
2024-07-14 02:04:40 -04:00
comfyanonymous
a3dffc447a Support AuraFlow Lora and loading model weights in diffusers format.
You can load model weights in diffusers format using the UNETLoader node.
2024-07-13 13:51:40 -04:00
comfyanonymous
ce2473bb01 Add link to AuraFlow example in Readme. 2024-07-12 15:25:07 -04:00
292 changed files with 194201 additions and 44157 deletions

View File

@@ -62,12 +62,38 @@ except:
print("checking out master branch") print("checking out master branch")
branch = repo.lookup_branch('master') branch = repo.lookup_branch('master')
ref = repo.lookup_reference(branch.name) if branch is None:
repo.checkout(ref) ref = repo.lookup_reference('refs/remotes/origin/master')
repo.checkout(ref)
branch = repo.lookup_branch('master')
if branch is None:
repo.create_branch('master', repo.get(ref.target))
else:
ref = repo.lookup_reference(branch.name)
repo.checkout(ref)
print("pulling latest changes") print("pulling latest changes")
pull(repo) pull(repo)
if "--stable" in sys.argv:
def latest_tag(repo):
versions = []
for k in repo.references:
try:
prefix = "refs/tags/v"
if k.startswith(prefix):
version = list(map(int, k[len(prefix):].split(".")))
versions.append((version[0] * 10000000000 + version[1] * 100000 + version[2], k))
except:
pass
versions.sort()
if len(versions) > 0:
return versions[-1][1]
return None
latest_tag = latest_tag(repo)
if latest_tag is not None:
repo.checkout(latest_tag)
print("Done!") print("Done!")
self_update = True self_update = True
@@ -108,3 +134,13 @@ if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path):
shutil.copy(repo_req_path, req_path) shutil.copy(repo_req_path, req_path)
except: except:
pass pass
stable_update_script = os.path.join(repo_path, ".ci/update_windows/update_comfyui_stable.bat")
stable_update_script_to = os.path.join(cur_path, "update_comfyui_stable.bat")
try:
if not file_size(stable_update_script_to) > 10:
shutil.copy(stable_update_script, stable_update_script_to)
except:
pass
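
The `--stable` path above packs each `vX.Y.Z` tag into a single integer so that tags sort numerically rather than lexically. A minimal standalone sketch of the same idea, assuming plain version strings instead of pygit2 reference names:

```python
# Minimal sketch of the tag-sorting idea used by the --stable updater above.
# Assumes plain "vX.Y.Z" strings rather than pygit2 reference names.
def pick_latest_tag(tags):
    versions = []
    for tag in tags:
        if not tag.startswith("v"):
            continue
        try:
            major, minor, patch = map(int, tag[1:].split("."))
        except ValueError:
            continue  # skip anything that is not a plain vX.Y.Z tag
        # Same packing as update.py: one integer that sorts numerically.
        versions.append((major * 10000000000 + minor * 100000 + patch, tag))
    versions.sort()
    return versions[-1][1] if versions else None

print(pick_latest_tag(["v0.0.2", "v0.1.0", "v0.0.10"]))  # -> v0.1.0
```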

View File

@@ -0,0 +1,8 @@
@echo off
..\python_embeded\python.exe .\update.py ..\ComfyUI\ --stable
if exist update_new.py (
move /y update_new.py update.py
echo Running updater again since it got updated.
..\python_embeded\python.exe .\update.py ..\ComfyUI\ --skip_self_update --stable
)
if "%~1"=="" pause

View File

@@ -14,7 +14,7 @@ run_cpu.bat
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
-You can download the stable diffusion 1.5 one from: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt
+You can download the stable diffusion 1.5 one from: https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/blob/main/v1-5-pruned-emaonly-fp16.safetensors

RECOMMENDED WAY TO UPDATE:

View File

@@ -0,0 +1,2 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast
pause

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
/web/assets/** linguist-generated
/web/** linguist-vendored

View File

@@ -1,5 +1,8 @@
blank_issues_enabled: true
contact_links:
+  - name: ComfyUI Frontend Issues
+    url: https://github.com/Comfy-Org/ComfyUI_frontend/issues
+    about: Issues related to the ComfyUI frontend (display issues, user interaction bugs), please go to the frontend repo to file the issue
  - name: ComfyUI Matrix Space
    url: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
    about: The ComfyUI Matrix Space is available for support and general discussion related to ComfyUI (Matrix is like Discord but open source).

View File

@@ -0,0 +1,53 @@
# This is the GitHub Workflow that drives full-GPU-enabled tests of pull requests to ComfyUI, when the 'Run-CI-Test' label is added
# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
name: Pull Request CI Workflow Runs
on:
pull_request_target:
types: [labeled]
jobs:
pr-test-stable:
if: ${{ github.event.label.name == 'Run-CI-Test' }}
strategy:
fail-fast: false
matrix:
os: [macos, linux, windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["stable"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
- os: windows
runner_label: [self-hosted, Windows]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
use_prior_commit: 'true'
comment:
if: ${{ github.event.label.name == 'Run-CI-Test' }}
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '(Automated Bot Message) CI Tests are running, you can view the results at https://ci.comfy.org/?branch=${{ github.event.pull_request.number }}%2Fmerge'
})

23
.github/workflows/pylint.yml vendored Normal file
View File

@@ -0,0 +1,23 @@
name: Python Linting

on: [push, pull_request]

jobs:
  pylint:
    name: Run Pylint
    runs-on: ubuntu-latest

    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: 3.x

    - name: Install Pylint
      run: pip install pylint

    - name: Run Pylint
      run: pylint --rcfile=.pylintrc $(find . -type f -name "*.py")

View File

@@ -2,9 +2,28 @@
name: "Release Stable Version" name: "Release Stable Version"
on: on:
push: workflow_dispatch:
tags: inputs:
- 'v*' git_tag:
description: 'Git tag'
required: true
type: string
cu:
description: 'CUDA version'
required: true
type: string
default: "124"
python_minor:
description: 'Python minor version'
required: true
type: string
default: "12"
python_patch:
description: 'Python patch version'
required: true
type: string
default: "7"
jobs: jobs:
package_comfy_windows: package_comfy_windows:
@@ -13,69 +32,44 @@ jobs:
packages: "write" packages: "write"
pull-requests: "read" pull-requests: "read"
runs-on: windows-latest runs-on: windows-latest
strategy:
matrix:
python_version: [3.11.8]
cuda_version: [121]
steps: steps:
- name: Calculate Minor Version
shell: bash
run: |
# Extract the minor version from the Python version
MINOR_VERSION=$(echo "${{ matrix.python_version }}" | cut -d'.' -f2)
echo "MINOR_VERSION=$MINOR_VERSION" >> $GITHUB_ENV
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
ref: ${{ inputs.git_tag }}
fetch-depth: 0 fetch-depth: 0
persist-credentials: false persist-credentials: false
- uses: actions/cache/restore@v4
id: cache
with:
path: |
cu${{ inputs.cu }}_python_deps.tar
update_comfyui_and_python_dependencies.bat
key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
- shell: bash - shell: bash
run: | run: |
echo "@echo off mv cu${{ inputs.cu }}_python_deps.tar ../
call update_comfyui.bat nopause
echo -
echo This will try to update pytorch and all python dependencies.
echo -
echo If you just want to update normally, close this and run update_comfyui.bat instead.
echo -
pause
..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r ../ComfyUI/requirements.txt pygit2
pause" > update_comfyui_and_python_dependencies.bat
python -m pip wheel --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r requirements.txt pygit2 -w ./temp_wheel_dir
python -m pip install --no-cache-dir ./temp_wheel_dir/*
echo installed basic
ls -lah temp_wheel_dir
mv temp_wheel_dir cu${{ matrix.cuda_version }}_python_deps
mv cu${{ matrix.cuda_version }}_python_deps ../
mv update_comfyui_and_python_dependencies.bat ../ mv update_comfyui_and_python_dependencies.bat ../
cd .. cd ..
tar xf cu${{ inputs.cu }}_python_deps.tar
pwd pwd
ls ls
- shell: bash
run: |
cd ..
cp -r ComfyUI ComfyUI_copy cp -r ComfyUI ComfyUI_copy
curl https://www.python.org/ftp/python/${{ matrix.python_version }}/python-${{ matrix.python_version }}-embed-amd64.zip -o python_embeded.zip curl https://www.python.org/ftp/python/3.${{ inputs.python_minor }}.${{ inputs.python_patch }}/python-3.${{ inputs.python_minor }}.${{ inputs.python_patch }}-embed-amd64.zip -o python_embeded.zip
unzip python_embeded.zip -d python_embeded unzip python_embeded.zip -d python_embeded
cd python_embeded cd python_embeded
echo ${{ env.MINOR_VERSION }} echo ${{ env.MINOR_VERSION }}
echo 'import site' >> ./python3${{ env.MINOR_VERSION }}._pth echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
./python.exe get-pip.py ./python.exe get-pip.py
./python.exe --version ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
echo "Pip version:" sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
./python.exe -m pip --version cd ..
set PATH=$PWD/Scripts:$PATH git clone --depth 1 https://github.com/comfyanonymous/taesd
echo $PATH
./python.exe -s -m pip install ../cu${{ matrix.cuda_version }}_python_deps/*
sed -i '1i../ComfyUI' ./python3${{ env.MINOR_VERSION }}._pth
cd ..
git clone https://github.com/comfyanonymous/taesd
cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/ cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
mkdir ComfyUI_windows_portable mkdir ComfyUI_windows_portable
@@ -104,6 +98,7 @@ jobs:
with: with:
repo_token: ${{ secrets.GITHUB_TOKEN }} repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ComfyUI_windows_portable_nvidia.7z file: ComfyUI_windows_portable_nvidia.7z
tag: ${{ github.ref }} tag: ${{ inputs.git_tag }}
overwrite: true overwrite: true
prerelease: true
make_latest: false
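
Because the stable release now runs on `workflow_dispatch` instead of a tag push, it has to be started manually with the inputs defined above. A hypothetical sketch of such a dispatch through the GitHub REST API; the workflow file name, tag value and token are placeholders, and only the input names (`git_tag`, `cu`, `python_minor`, `python_patch`) come from the diff:

```python
# Hypothetical sketch: queue a "Release Stable Version" workflow_dispatch run.
# The workflow file name, tag and token below are placeholders.
import requests

resp = requests.post(
    "https://api.github.com/repos/comfyanonymous/ComfyUI/actions/workflows/RELEASE_WORKFLOW_FILE.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": "Bearer <github_token>",
    },
    json={
        "ref": "master",
        "inputs": {"git_tag": "vX.Y.Z", "cu": "124", "python_minor": "12", "python_patch": "7"},
    },
    timeout=30,
)
resp.raise_for_status()  # the API answers 204 No Content when the run is queued
```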

21
.github/workflows/stale-issues.yml vendored Normal file
View File

@@ -0,0 +1,21 @@
name: 'Close stale issues'
on:
schedule:
# Run daily at 430 am PT
- cron: '30 11 * * *'
permissions:
issues: write
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v9
with:
stale-issue-message: "This issue is being marked stale because it has not had any activity for 30 days. Reply below within 7 days if your issue still isn't solved, and it will be left open. Otherwise, the issue will be closed automatically."
days-before-stale: 30
days-before-close: 7
stale-issue-label: 'Stale'
only-labels: 'User Support'
exempt-all-assignees: true
exempt-all-milestones: true

View File

@@ -1,76 +0,0 @@
# This is a temporary action during frontend TS migration.
# This file should be removed after TS migration is completed.
# The browser test is here to ensure TS repo is working the same way as the
# current JS code.
# If you are adding UI feature, please sync your changes to the TS repo:
# huchenlei/ComfyUI_frontend and update test expectation files accordingly.
name: Playwright Browser Tests CI
on:
push:
branches: [ main, master ]
pull_request:
branches: [ main, master ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout ComfyUI
uses: actions/checkout@v4
with:
repository: "comfyanonymous/ComfyUI"
path: "ComfyUI"
- name: Checkout ComfyUI_frontend
uses: actions/checkout@v4
with:
repository: "huchenlei/ComfyUI_frontend"
path: "ComfyUI_frontend"
ref: "fcc54d803e5b6a9b08a462a1d94899318c96dcbb"
- uses: actions/setup-node@v3
with:
node-version: lts/*
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install wait-for-it
working-directory: ComfyUI
- name: Start ComfyUI server
run: |
python main.py --cpu 2>&1 | tee console_output.log &
wait-for-it --service 127.0.0.1:8188 -t 600
working-directory: ComfyUI
- name: Install ComfyUI_frontend dependencies
run: |
npm ci
working-directory: ComfyUI_frontend
- name: Install Playwright Browsers
run: npx playwright install --with-deps
working-directory: ComfyUI_frontend
- name: Run Playwright tests
run: npx playwright test
working-directory: ComfyUI_frontend
- name: Check for unhandled exceptions in server log
run: |
if grep -qE "Exception|Error" console_output.log; then
echo "Unhandled exception/error found in server log."
exit 1
fi
working-directory: ComfyUI
- uses: actions/upload-artifact@v4
if: always()
with:
name: playwright-report
path: ComfyUI_frontend/playwright-report/
retention-days: 30
- uses: actions/upload-artifact@v4
if: always()
with:
name: console-output
path: ComfyUI/console_output.log
retention-days: 30

95
.github/workflows/test-ci.yml vendored Normal file
View File

@@ -0,0 +1,95 @@
# This is the GitHub Workflow that drives automatic full-GPU-enabled tests of all new commits to the master branch of ComfyUI
# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
name: Full Comfy CI Workflow Runs
on:
push:
branches:
- master
paths-ignore:
- 'app/**'
- 'input/**'
- 'output/**'
- 'notebooks/**'
- 'script_examples/**'
- '.github/**'
- 'web/**'
workflow_dispatch:
jobs:
test-stable:
strategy:
fail-fast: false
matrix:
os: [macos, linux, windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["stable"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
- os: windows
runner_label: [self-hosted, Windows]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
test-win-nightly:
strategy:
fail-fast: true
matrix:
os: [windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["nightly"]
include:
- os: windows
runner_label: [self-hosted, Windows]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
test-unix-nightly:
strategy:
fail-fast: false
matrix:
os: [macos, linux]
python_version: ["3.11"]
cuda_version: ["12.1"]
torch_version: ["nightly"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}

45
.github/workflows/test-launch.yml vendored Normal file
View File

@@ -0,0 +1,45 @@
name: Test server launches without errors
on:
push:
branches: [ main, master ]
pull_request:
branches: [ main, master ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout ComfyUI
uses: actions/checkout@v4
with:
repository: "comfyanonymous/ComfyUI"
path: "ComfyUI"
- uses: actions/setup-python@v4
with:
python-version: '3.8'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install wait-for-it
working-directory: ComfyUI
- name: Start ComfyUI server
run: |
python main.py --cpu 2>&1 | tee console_output.log &
wait-for-it --service 127.0.0.1:8188 -t 600
working-directory: ComfyUI
- name: Check for unhandled exceptions in server log
run: |
if grep -qE "Exception|Error" console_output.log; then
echo "Unhandled exception/error found in server log."
exit 1
fi
working-directory: ComfyUI
- uses: actions/upload-artifact@v4
if: always()
with:
name: console-output
path: ComfyUI/console_output.log
retention-days: 30

View File

@@ -1,26 +0,0 @@
name: Tests CI
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v3
with:
node-version: 18
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
- name: Run Tests
run: |
npm ci
npm run test:generate
npm test -- --verbose
working-directory: ./tests-ui

30
.github/workflows/test-unit.yml vendored Normal file
View File

@@ -0,0 +1,30 @@
name: Unit Tests
on:
push:
branches: [ main, master ]
pull_request:
branches: [ main, master ]
jobs:
test:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
runs-on: ${{ matrix.os }}
continue-on-error: true
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
- name: Run Unit Tests
run: |
pip install -r tests-unit/requirements.txt
python -m pytest tests-unit

View File

@@ -8,23 +8,28 @@ on:
        required: false
        type: string
        default: ""
+      extra_dependencies:
+        description: 'extra dependencies'
+        required: false
+        type: string
+        default: ""
      cu:
        description: 'cuda version'
        required: true
        type: string
-        default: "121"
+        default: "124"
      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "11"
+        default: "12"
      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "8"
+        default: "7"
#  push:
#    branches:
#      - master
@@ -51,7 +56,7 @@ jobs:
        ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
        pause" > update_comfyui_and_python_dependencies.bat
-        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
+        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
        python -m pip install --no-cache-dir ./temp_wheel_dir/*
        echo installed basic
        ls -lah temp_wheel_dir

View File

@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "3"
+        default: "4"
#  push:
#    branches:
#      - master
@@ -49,13 +49,13 @@ jobs:
        echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
        curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
        ./python.exe get-pip.py
-        python -m pip wheel torch torchvision torchaudio mpmath==1.3.0 numpy==1.26.4 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+        python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
        ls ../temp_wheel_dir
        ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
        sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
        cd ..
-        git clone https://github.com/comfyanonymous/taesd
+        git clone --depth 1 https://github.com/comfyanonymous/taesd
        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
        mkdir ComfyUI_windows_portable_nightly_pytorch
@@ -67,6 +67,7 @@ jobs:
        mkdir update
        cp -r ComfyUI/.ci/update_windows/* ./update/
        cp -r ComfyUI/.ci/windows_base_files/* ./
+        cp -r ComfyUI/.ci/windows_nightly_base_files/* ./
        echo "call update_comfyui.bat nopause
        ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2

View File

@@ -7,19 +7,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "121"
+        default: "124"
      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "11"
+        default: "12"
      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "8"
+        default: "7"
#  push:
#    branches:
#      - master
@@ -66,7 +66,7 @@ jobs:
        sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
        cd ..
-        git clone https://github.com/comfyanonymous/taesd
+        git clone --depth 1 https://github.com/comfyanonymous/taesd
        cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
        mkdir ComfyUI_windows_portable

5
.gitignore vendored
View File

@@ -12,9 +12,12 @@ extra_model_paths.yaml
.vscode/
.idea/
venv/
+.venv/
/web/extensions/*
!/web/extensions/logging.js.example
!/web/extensions/core/
/tests-ui/data/object_info.json
/user/
*.log
+web_custom_versions/
+.DS_Store

3
.pylintrc Normal file
View File

@@ -0,0 +1,3 @@
[MESSAGES CONTROL]
disable=all
enable=eval-used
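
This `.pylintrc` disables every check and re-enables only `eval-used`, matching the "Disallow use of eval with pylint" change in the commit list. A small sketch, assuming pylint is installed, of the kind of file it flags:

```python
# eval_demo.py -- a hypothetical file this .pylintrc configuration would flag.
# Running `pylint --rcfile=.pylintrc eval_demo.py` reports W0123 (eval-used) on
# the eval() call below and nothing else, because every other check is disabled.
user_expression = "1 + 1"
result = eval(user_expression)  # triggers the eval-used warning
print(result)
```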

112
README.md
View File

@@ -1,8 +1,35 @@
-ComfyUI
-=======
-The most powerful and modular stable diffusion GUI and backend.
------------
+<div align="center">
+
+# ComfyUI
+**The most powerful and modular diffusion model GUI and backend.**
+
+[![Website][website-shield]][website-url]
+[![Dynamic JSON Badge][discord-shield]][discord-url]
+[![Matrix][matrix-shield]][matrix-url]
+<br>
+[![][github-release-shield]][github-release-link]
+[![][github-release-date-shield]][github-release-link]
+[![][github-downloads-shield]][github-downloads-link]
+[![][github-downloads-latest-shield]][github-downloads-link]
+
+[matrix-shield]: https://img.shields.io/badge/Matrix-000000?style=flat&logo=matrix&logoColor=white
+[matrix-url]: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
+[website-shield]: https://img.shields.io/badge/ComfyOrg-4285F4?style=flat
+[website-url]: https://www.comfy.org/
+<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
+[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
+[discord-url]: https://www.comfy.org/discord
+[github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
+[github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
+[github-release-date-shield]: https://img.shields.io/github/release-date/comfyanonymous/ComfyUI?style=flat
+[github-downloads-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/total?style=flat
+[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
+[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
+
![ComfyUI Screenshot](comfyui_screenshot.png)
+</div>

This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and see what ComfyUI can do you can check out:
### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)

@@ -12,6 +39,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
## Features
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
+- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that changes between executions.
- Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -32,6 +60,8 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
- [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
+- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Starts up very fast.
- Works fully offline: will never download anything.
@@ -45,6 +75,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
|------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| Ctrl + Enter | Queue up current graph for generation |
| Ctrl + Shift + Enter | Queue up current graph as first for generation |
+| Ctrl + Alt + Enter | Cancel current generation |
| Ctrl + Z/Ctrl + Y | Undo/Redo |
| Ctrl + S | Save workflow |
| Ctrl + O | Load workflow |
@@ -63,10 +94,14 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
| Alt + `+` | Canvas Zoom in |
| Alt + `-` | Canvas Zoom out |
| Ctrl + Shift + LMB + Vertical drag | Canvas Zoom in/out |
+| P | Pin/Unpin selected nodes |
+| Ctrl + G | Group selected nodes |
| Q | Toggle visibility of the queue |
| H | Toggle visibility of history |
| R | Refresh graph |
| Double-Click LMB | Open node quick search palette |
+| Shift + Drag | Move multiple wires at once |
+| Ctrl + Alt + LMB | Disconnect all wires from clicked slot |

Ctrl can also be replaced with Cmd instead for macOS users
@@ -76,7 +111,7 @@ Ctrl can also be replaced with Cmd instead for macOS users
There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

-### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
+### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)

Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
@@ -92,6 +127,8 @@ To run it on services like paperspace, kaggle or colab you can use my [Jupyter N
## Manual Install (Windows, Linux)

+Note that some dependencies do not yet support python 3.13 so using 3.12 is recommended.
+
Git clone this repo.

Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
@@ -102,17 +139,17 @@ Put your VAE in: models/vae
### AMD GPUs (Linux only)
AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1```

-This is the command to install the nightly with ROCm 6.0 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.1```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2```

### NVIDIA

Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124```

This is the command to install pytorch nightly instead which might have performance improvements:
@@ -162,20 +199,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

-### I already have another UI for Stable Diffusion installed do I really have to install all of these dependencies?
-
-You don't. If you have another UI installed and working with its own python venv you can use that venv to run ComfyUI. You can open up your favorite terminal and activate it:
-
-```source path_to_other_sd_gui/venv/bin/activate```
-
-or on Windows:
-
-With Powershell: ```"path_to_other_sd_gui\venv\Scripts\Activate.ps1"```
-
-With cmd.exe: ```"path_to_other_sd_gui\venv\Scripts\activate.bat"```
-
-And then you can use that terminal to run ComfyUI without installing any dependencies. Note that the venv folder might be called something else depending on the SD UI.
-
# Running

```python main.py```
@@ -211,7 +234,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the
Use ```--preview-method auto``` to enable previews.

-The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
+The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.

## How to use TLS/SSL?
Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
@@ -227,6 +250,47 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
See also: [https://www.comfy.org/](https://www.comfy.org/)

+## Frontend Development
+
+As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
+
+### Reporting Issues and Requesting Features
+
+For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
+
+### Using the Latest Frontend
+
+The new frontend is now the default for ComfyUI. However, please note:
+
+1. The frontend in the main ComfyUI repository is updated weekly.
+2. Daily releases are available in the separate frontend repository.
+
+To use the most up-to-date frontend version:
+
+1. For the latest daily release, launch ComfyUI with this command line argument:
+
+   ```
+   --front-end-version Comfy-Org/ComfyUI_frontend@latest
+   ```
+
+2. For a specific version, replace `latest` with the desired version number:
+
+   ```
+   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
+   ```
+
+This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.
+
+### Accessing the Legacy Frontend
+
+If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
+
+```
+--front-end-version Comfy-Org/ComfyUI_legacy_frontend@latest
+```
+
+This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
+
# QA

### Which GPU should I buy for this?

0
api_server/__init__.py Normal file
View File

View File

View File

@@ -0,0 +1,3 @@
# ComfyUI Internal Routes
All routes under the `/internal` path are designated for **internal use by ComfyUI only**. These routes are not intended for use by external applications and may change at any time without notice.

View File

View File

@@ -0,0 +1,51 @@
from aiohttp import web
from typing import Optional
from folder_paths import models_dir, user_directory, output_directory, folder_names_and_paths
from api_server.services.file_service import FileService
import app.logger


class InternalRoutes:
    '''
    The top level web router for internal routes: /internal/*
    The endpoints here should NOT be depended upon. It is for ComfyUI frontend use only.
    Check README.md for more information.
    '''

    def __init__(self):
        self.routes: web.RouteTableDef = web.RouteTableDef()
        self._app: Optional[web.Application] = None
        self.file_service = FileService({
            "models": models_dir,
            "user": user_directory,
            "output": output_directory
        })

    def setup_routes(self):
        @self.routes.get('/files')
        async def list_files(request):
            directory_key = request.query.get('directory', '')
            try:
                file_list = self.file_service.list_files(directory_key)
                return web.json_response({"files": file_list})
            except ValueError as e:
                return web.json_response({"error": str(e)}, status=400)
            except Exception as e:
                return web.json_response({"error": str(e)}, status=500)

        @self.routes.get('/logs')
        async def get_logs(request):
            return web.json_response(app.logger.get_logs())

        @self.routes.get('/folder_paths')
        async def get_folder_paths(request):
            response = {}
            for key in folder_names_and_paths:
                response[key] = folder_names_and_paths[key][0]
            return web.json_response(response)

    def get_app(self):
        if self._app is None:
            self._app = web.Application()
            self.setup_routes()
            self._app.add_routes(self.routes)
        return self._app
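
A short sketch of exercising these internal endpoints from a client while a local ComfyUI server is running. The default address 127.0.0.1:8188 is assumed, and since the routes are explicitly not a stable API, the exact responses are illustrative only:

```python
# Illustrative only: query the /internal routes of a locally running ComfyUI server.
# Assumes the server is up on the default 127.0.0.1:8188.
import requests

base = "http://127.0.0.1:8188/internal"

# List files under one of the allowed directory keys ("models", "user", "output").
files = requests.get(f"{base}/files", params={"directory": "models"}, timeout=10).json()
print(files["files"][:5])

# The in-memory log buffer exposed via app.logger.get_logs(), returned as one string.
print(requests.get(f"{base}/logs", timeout=10).json())

# Mapping of folder names to their configured paths.
print(requests.get(f"{base}/folder_paths", timeout=10).json())
```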

View File

View File

@@ -0,0 +1,13 @@
from typing import Dict, List, Optional
from api_server.utils.file_operations import FileSystemOperations, FileSystemItem


class FileService:
    def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
        self.allowed_directories: Dict[str, str] = allowed_directories
        self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()

    def list_files(self, directory_key: str) -> List[FileSystemItem]:
        if directory_key not in self.allowed_directories:
            raise ValueError("Invalid directory key")
        directory_path: str = self.allowed_directories[directory_key]
        return self.file_system_ops.walk_directory(directory_path)

View File

@@ -0,0 +1,42 @@
import os
from typing import List, Union, TypedDict, Literal
from typing_extensions import TypeGuard


class FileInfo(TypedDict):
    name: str
    path: str
    type: Literal["file"]
    size: int


class DirectoryInfo(TypedDict):
    name: str
    path: str
    type: Literal["directory"]


FileSystemItem = Union[FileInfo, DirectoryInfo]


def is_file_info(item: FileSystemItem) -> TypeGuard[FileInfo]:
    return item["type"] == "file"


class FileSystemOperations:
    @staticmethod
    def walk_directory(directory: str) -> List[FileSystemItem]:
        file_list: List[FileSystemItem] = []
        for root, dirs, files in os.walk(directory):
            for name in files:
                file_path = os.path.join(root, name)
                relative_path = os.path.relpath(file_path, directory)
                file_list.append({
                    "name": name,
                    "path": relative_path,
                    "type": "file",
                    "size": os.path.getsize(file_path)
                })
            for name in dirs:
                dir_path = os.path.join(root, name)
                relative_path = os.path.relpath(dir_path, directory)
                file_list.append({
                    "name": name,
                    "path": relative_path,
                    "type": "directory"
                })
        return file_list
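
A brief usage sketch for the helpers above, run from a ComfyUI checkout so the import resolves; the directory path is a placeholder:

```python
# Usage sketch for FileSystemOperations / is_file_info; the path is a placeholder.
from api_server.utils.file_operations import FileSystemOperations, is_file_info

items = FileSystemOperations.walk_directory("/path/to/some/directory")
# Only FileInfo entries carry a "size" field, so narrow with the type guard first.
total_size = sum(item["size"] for item in items if is_file_info(item))
print(f"{len(items)} entries, {total_size} bytes of files")
```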

0
app/__init__.py Normal file
View File

208
app/frontend_management.py Normal file
View File

@@ -0,0 +1,208 @@
from __future__ import annotations
import argparse
import logging
import os
import re
import tempfile
import zipfile
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import TypedDict, Optional
import requests
from typing_extensions import NotRequired
from comfy.cli_args import DEFAULT_VERSION_STRING
REQUEST_TIMEOUT = 10 # seconds
class Asset(TypedDict):
url: str
class Release(TypedDict):
id: int
tag_name: str
name: str
prerelease: bool
created_at: str
published_at: str
body: str
assets: NotRequired[list[Asset]]
@dataclass
class FrontEndProvider:
owner: str
repo: str
@property
def folder_name(self) -> str:
return f"{self.owner}_{self.repo}"
@property
def release_url(self) -> str:
return f"https://api.github.com/repos/{self.owner}/{self.repo}/releases"
@cached_property
def all_releases(self) -> list[Release]:
releases = []
api_url = self.release_url
while api_url:
response = requests.get(api_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status() # Raises an HTTPError if the response was an error
releases.extend(response.json())
# GitHub uses the Link header to provide pagination links. Check if it exists and update api_url accordingly.
if "next" in response.links:
api_url = response.links["next"]["url"]
else:
api_url = None
return releases
@cached_property
def latest_release(self) -> Release:
latest_release_url = f"{self.release_url}/latest"
response = requests.get(latest_release_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status() # Raises an HTTPError if the response was an error
return response.json()
def get_release(self, version: str) -> Release:
if version == "latest":
return self.latest_release
else:
for release in self.all_releases:
if release["tag_name"] in [version, f"v{version}"]:
return release
raise ValueError(f"Version {version} not found in releases")
def download_release_asset_zip(release: Release, destination_path: str) -> None:
"""Download dist.zip from github release."""
asset_url = None
for asset in release.get("assets", []):
if asset["name"] == "dist.zip":
asset_url = asset["url"]
break
if not asset_url:
raise ValueError("dist.zip not found in the release assets")
# Use a temporary file to download the zip content
with tempfile.TemporaryFile() as tmp_file:
headers = {"Accept": "application/octet-stream"}
response = requests.get(
asset_url, headers=headers, allow_redirects=True, timeout=REQUEST_TIMEOUT
)
response.raise_for_status() # Ensure we got a successful response
# Write the content to the temporary file
tmp_file.write(response.content)
# Go back to the beginning of the temporary file
tmp_file.seek(0)
# Extract the zip file content to the destination path
with zipfile.ZipFile(tmp_file, "r") as zip_ref:
zip_ref.extractall(destination_path)
class FrontendManager:
DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
@classmethod
def parse_version_string(cls, value: str) -> tuple[str, str, str]:
"""
Args:
value (str): The version string to parse.
Returns:
tuple[str, str]: A tuple containing provider name and version.
Raises:
argparse.ArgumentTypeError: If the version string is invalid.
"""
VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+|latest)$"
match_result = re.match(VERSION_PATTERN, value)
if match_result is None:
raise argparse.ArgumentTypeError(f"Invalid version string: {value}")
return match_result.group(1), match_result.group(2), match_result.group(3)
@classmethod
def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
"""
Initializes the frontend for the specified version.
Args:
version_string (str): The version string.
provider (FrontEndProvider, optional): The provider to use. Defaults to None.
Returns:
str: The path to the initialized frontend.
Raises:
Exception: If there is an error during the initialization process.
main error source might be request timeout or invalid URL.
"""
if version_string == DEFAULT_VERSION_STRING:
return cls.DEFAULT_FRONTEND_PATH
repo_owner, repo_name, version = cls.parse_version_string(version_string)
if version.startswith("v"):
expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
if os.path.exists(expected_path):
logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
return expected_path
logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")
provider = provider or FrontEndProvider(repo_owner, repo_name)
release = provider.get_release(version)
semantic_version = release["tag_name"].lstrip("v")
web_root = str(
Path(cls.CUSTOM_FRONTENDS_ROOT) / provider.folder_name / semantic_version
)
if not os.path.exists(web_root):
# Use tmp path until complete to avoid path exists check passing from interrupted downloads
tmp_path = web_root + ".tmp"
try:
os.makedirs(tmp_path, exist_ok=True)
logging.info(
"Downloading frontend(%s) version(%s) to (%s)",
provider.folder_name,
semantic_version,
tmp_path,
)
logging.debug(release)
download_release_asset_zip(release, destination_path=tmp_path)
if os.listdir(tmp_path):
os.rename(tmp_path, web_root)
finally:
# Clean up the directory if it is empty, i.e. the download failed
if not os.listdir(web_root):
os.rmdir(web_root)
return web_root
@classmethod
def init_frontend(cls, version_string: str) -> str:
"""
Initializes the frontend with the specified version string.
Args:
version_string (str): The version string to initialize the frontend with.
Returns:
str: The path of the initialized frontend.
"""
try:
return cls.init_frontend_unsafe(version_string)
except Exception as e:
logging.error("Failed to initialize frontend: %s", e)
logging.info("Falling back to the default frontend.")
return cls.DEFAULT_FRONTEND_PATH
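
A small sketch of how the version-string parsing and frontend initialization above fit together. It has to run from a ComfyUI checkout so the imports resolve, and the version string is just an example:

```python
# Usage sketch for FrontendManager; run inside a ComfyUI checkout so imports resolve.
from app.frontend_management import FrontendManager

# "owner/repo@version" strings are split into their three parts.
owner, repo, version = FrontendManager.parse_version_string("Comfy-Org/ComfyUI_frontend@latest")
print(owner, repo, version)  # Comfy-Org ComfyUI_frontend latest

# init_frontend downloads (or reuses) the requested build and returns its web root,
# falling back to the bundled frontend if anything goes wrong.
web_root = FrontendManager.init_frontend("Comfy-Org/ComfyUI_frontend@latest")
print(web_root)
```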

31
app/logger.py Normal file
View File

@@ -0,0 +1,31 @@
import logging
from logging.handlers import MemoryHandler
from collections import deque

logs = None
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")


def get_logs():
    return "\n".join([formatter.format(x) for x in logs])


def setup_logger(log_level: str = 'INFO', capacity: int = 300):
    global logs
    if logs:
        return

    # Setup default global logger
    logger = logging.getLogger()
    logger.setLevel(log_level)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(stream_handler)

    # Create a memory handler with a deque as its buffer
    logs = deque(maxlen=capacity)
    memory_handler = MemoryHandler(capacity, flushLevel=logging.INFO)
    memory_handler.buffer = logs
    memory_handler.setFormatter(formatter)
    logger.addHandler(memory_handler)
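
A minimal sketch of the logger in use, run from a ComfyUI checkout; the message text is arbitrary:

```python
# Minimal usage sketch for app.logger: set up the in-memory buffer, then read it back.
import logging
from app.logger import setup_logger, get_logs

setup_logger(log_level="INFO", capacity=300)
logging.info("model loaded")  # goes to the stream handler and into the memory buffer
print(get_logs())             # formatted contents of the deque, oldest first
```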

View File

@@ -5,17 +5,17 @@ import uuid
import glob
import shutil
from aiohttp import web
+from urllib import parse
from comfy.cli_args import args
-from folder_paths import user_directory
+import folder_paths
from .app_settings import AppSettings

default_user = "default"
-users_file = os.path.join(user_directory, "users.json")

class UserManager():
    def __init__(self):
-        global user_directory
+        user_directory = folder_paths.get_user_directory()

        self.settings = AppSettings(self)
        if not os.path.exists(user_directory):
@@ -25,14 +25,17 @@ class UserManager():
            print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")

        if args.multi_user:
-            if os.path.isfile(users_file):
-                with open(users_file) as f:
+            if os.path.isfile(self.get_users_file()):
+                with open(self.get_users_file()) as f:
                    self.users = json.load(f)
            else:
                self.users = {}
        else:
            self.users = {"default": "default"}

+    def get_users_file(self):
+        return os.path.join(folder_paths.get_user_directory(), "users.json")
+
    def get_request_user_id(self, request):
        user = "default"
        if args.multi_user and "comfy-user" in request.headers:
@@ -44,7 +47,7 @@ class UserManager():
        return user

    def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
-        global user_directory
+        user_directory = folder_paths.get_user_directory()

        if type == "userdata":
            root_dir = user_directory
@@ -59,6 +62,10 @@ class UserManager():
            return None

        if file is not None:
+            # Check if filename is url encoded
+            if "%" in file:
+                file = parse.unquote(file)
+
            # prevent leaving /{type}/{user}
            path = os.path.abspath(os.path.join(user_root, file))
            if os.path.commonpath((user_root, path)) != user_root:
@@ -80,8 +87,7 @@ class UserManager():
        self.users[user_id] = name

-        global users_file
-        with open(users_file, "w") as f:
+        with open(self.get_users_file(), "w") as f:
            json.dump(self.users, f)

        return user_id
@@ -112,25 +118,69 @@ class UserManager():
        @routes.get("/userdata")
        async def listuserdata(request):
+            """
+            List user data files in a specified directory.
+
+            This endpoint allows listing files in a user's data directory, with options for recursion,
+            full file information, and path splitting.
+
+            Query Parameters:
+            - dir (required): The directory to list files from.
+            - recurse (optional): If "true", recursively list files in subdirectories.
+            - full_info (optional): If "true", return detailed file information (path, size, modified time).
+            - split (optional): If "true", split file paths into components (only applies when full_info is false).
+
+            Returns:
+            - 400: If 'dir' parameter is missing.
+            - 403: If the requested path is not allowed.
+            - 404: If the requested directory does not exist.
+            - 200: JSON response with the list of files or file information.
+
+            The response format depends on the query parameters:
+            - Default: List of relative file paths.
+            - full_info=true: List of dictionaries with file details.
+            - split=true (and full_info=false): List of lists, each containing path components.
+            """
            directory = request.rel_url.query.get('dir', '')
            if not directory:
-                return web.Response(status=400)
+                return web.Response(status=400, text="Directory not provided")

            path = self.get_request_user_filepath(request, directory)
if not path: if not path:
return web.Response(status=403) return web.Response(status=403, text="Invalid directory")
if not os.path.exists(path): if not os.path.exists(path):
return web.Response(status=404) return web.Response(status=404, text="Directory not found")
recurse = request.rel_url.query.get('recurse', '').lower() == "true" recurse = request.rel_url.query.get('recurse', '').lower() == "true"
results = glob.glob(os.path.join( full_info = request.rel_url.query.get('full_info', '').lower() == "true"
glob.escape(path), '**/*'), recursive=recurse)
results = [os.path.relpath(x, path) for x in results if os.path.isfile(x)] # Use different patterns based on whether we're recursing or not
if recurse:
pattern = os.path.join(glob.escape(path), '**', '*')
else:
pattern = os.path.join(glob.escape(path), '*')
results = glob.glob(pattern, recursive=recurse)
if full_info:
results = [
{
'path': os.path.relpath(x, path).replace(os.sep, '/'),
'size': os.path.getsize(x),
'modified': os.path.getmtime(x)
} for x in results if os.path.isfile(x)
]
else:
results = [
os.path.relpath(x, path).replace(os.sep, '/')
for x in results
if os.path.isfile(x)
]
split_path = request.rel_url.query.get('split', '').lower() == "true" split_path = request.rel_url.query.get('split', '').lower() == "true"
if split_path: if split_path and not full_info:
results = [[x] + x.split(os.sep) for x in results] results = [[x] + x.split('/') for x in results]
return web.json_response(results) return web.json_response(results)
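
The docstring above translates into requests like the following sketch (the server address, port and the "workflows" directory are illustrative assumptions; the query parameters are the documented ones):

import json
import urllib.parse
import urllib.request

query = urllib.parse.urlencode({
    "dir": "workflows",    # required: directory under the user's data root
    "recurse": "true",     # also list files in subdirectories
    "full_info": "true",   # return path/size/modified dicts instead of bare paths
})
with urllib.request.urlopen(f"http://127.0.0.1:8188/userdata?{query}") as resp:
    files = json.loads(resp.read())
# files -> a list of {"path": ..., "size": ..., "modified": ...} dictionaries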
@@ -138,14 +188,14 @@ class UserManager():
file = request.match_info.get(param, None) file = request.match_info.get(param, None)
if not file: if not file:
return web.Response(status=400) return web.Response(status=400)
path = self.get_request_user_filepath(request, file) path = self.get_request_user_filepath(request, file)
if not path: if not path:
return web.Response(status=403) return web.Response(status=403)
if check_exists and not os.path.exists(path): if check_exists and not os.path.exists(path):
return web.Response(status=404) return web.Response(status=404)
return path return path
@routes.get("/userdata/{file}") @routes.get("/userdata/{file}")
@@ -153,7 +203,7 @@ class UserManager():
path = get_user_data_path(request, check_exists=True) path = get_user_data_path(request, check_exists=True)
if not isinstance(path, str): if not isinstance(path, str):
return path return path
return web.FileResponse(path) return web.FileResponse(path)
@routes.post("/userdata/{file}") @routes.post("/userdata/{file}")
@@ -161,7 +211,7 @@ class UserManager():
path = get_user_data_path(request) path = get_user_data_path(request)
if not isinstance(path, str): if not isinstance(path, str):
return path return path
overwrite = request.query["overwrite"] != "false" overwrite = request.query["overwrite"] != "false"
if not overwrite and os.path.exists(path): if not overwrite and os.path.exists(path):
return web.Response(status=409) return web.Response(status=409)
@@ -170,7 +220,7 @@ class UserManager():
with open(path, "wb") as f: with open(path, "wb") as f:
f.write(body) f.write(body)
resp = os.path.relpath(path, self.get_request_user_filepath(request, None)) resp = os.path.relpath(path, self.get_request_user_filepath(request, None))
return web.json_response(resp) return web.json_response(resp)
@@ -181,7 +231,7 @@ class UserManager():
return path return path
os.remove(path) os.remove(path)
return web.Response(status=204) return web.Response(status=204)
@routes.post("/userdata/{file}/move/{dest}") @routes.post("/userdata/{file}/move/{dest}")
@@ -189,17 +239,17 @@ class UserManager():
source = get_user_data_path(request, check_exists=True) source = get_user_data_path(request, check_exists=True)
if not isinstance(source, str): if not isinstance(source, str):
return source return source
dest = get_user_data_path(request, check_exists=False, param="dest") dest = get_user_data_path(request, check_exists=False, param="dest")
if not isinstance(source, str): if not isinstance(source, str):
return dest return dest
overwrite = request.query["overwrite"] != "false" overwrite = request.query["overwrite"] != "false"
if not overwrite and os.path.exists(dest): if not overwrite and os.path.exists(dest):
return web.Response(status=409) return web.Response(status=409)
print(f"moving '{source}' -> '{dest}'") print(f"moving '{source}' -> '{dest}'")
shutil.move(source, dest) shutil.move(source, dest)
resp = os.path.relpath(dest, self.get_request_user_filepath(request, None)) resp = os.path.relpath(dest, self.get_request_user_filepath(request, None))
return web.json_response(resp) return web.json_response(resp)


@@ -13,6 +13,7 @@ from ..ldm.modules.diffusionmodules.util import (
from ..ldm.modules.attention import SpatialTransformer from ..ldm.modules.attention import SpatialTransformer
from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
from ..ldm.util import exists from ..ldm.util import exists
from .control_types import UNION_CONTROLNET_TYPES
from collections import OrderedDict from collections import OrderedDict
import comfy.ops import comfy.ops
from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.modules.attention import optimized_attention
@@ -92,7 +93,7 @@ class ControlNet(nn.Module):
transformer_depth_middle=None, transformer_depth_middle=None,
transformer_depth_output=None, transformer_depth_output=None,
attn_precision=None, attn_precision=None,
union_controlnet=False, union_controlnet_num_control_type=None,
device=None, device=None,
operations=comfy.ops.disable_weight_init, operations=comfy.ops.disable_weight_init,
**kwargs, **kwargs,
@@ -320,8 +321,8 @@ class ControlNet(nn.Module):
self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device) self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device)
self._feature_size += ch self._feature_size += ch
if union_controlnet: if union_controlnet_num_control_type is not None:
self.num_control_type = 6 self.num_control_type = union_controlnet_num_control_type
num_trans_channel = 320 num_trans_channel = 320
num_trans_head = 8 num_trans_head = 8
num_trans_layer = 1 num_trans_layer = 1
@@ -361,7 +362,7 @@ class ControlNet(nn.Module):
controlnet_cond = self.input_hint_block(hint[idx], emb, context) controlnet_cond = self.input_hint_block(hint[idx], emb, context)
feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) feat_seq = torch.mean(controlnet_cond, dim=(2, 3))
if idx < len(control_type): if idx < len(control_type):
feat_seq += self.task_embedding[control_type[idx]] feat_seq += self.task_embedding[control_type[idx]].to(dtype=feat_seq.dtype, device=feat_seq.device)
inputs.append(feat_seq.unsqueeze(1)) inputs.append(feat_seq.unsqueeze(1))
condition_list.append(controlnet_cond) condition_list.append(controlnet_cond)
@@ -390,6 +391,18 @@ class ControlNet(nn.Module):
if self.control_add_embedding is not None: #Union Controlnet if self.control_add_embedding is not None: #Union Controlnet
control_type = kwargs.get("control_type", []) control_type = kwargs.get("control_type", [])
if any([c >= self.num_control_type for c in control_type]):
max_type = max(control_type)
max_type_name = {
v: k for k, v in UNION_CONTROLNET_TYPES.items()
}[max_type]
raise ValueError(
f"Control type {max_type_name}({max_type}) is out of range for the number of control types" +
f"({self.num_control_type}) supported.\n" +
"Please consider using the ProMax ControlNet Union model.\n" +
"https://huggingface.co/xinsir/controlnet-union-sdxl-1.0/tree/main"
)
emb += self.control_add_embedding(control_type, emb.dtype, emb.device) emb += self.control_add_embedding(control_type, emb.dtype, emb.device)
if len(control_type) > 0: if len(control_type) > 0:
if len(hint.shape) < 5: if len(hint.shape) < 5:


@@ -0,0 +1,10 @@
UNION_CONTROLNET_TYPES = {
"openpose": 0,
"depth": 1,
"hed/pidi/scribble/ted": 2,
"canny/lineart/anime_lineart/mlsd": 3,
"normal": 4,
"segment": 5,
"tile": 6,
"repaint": 7,
}
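
A small sketch of how this mapping is meant to be consumed (the user selection list is hypothetical, and the import path assumes the new file lives next to the ControlNet model as comfy/cldm/control_types.py, matching the "from .control_types import UNION_CONTROLNET_TYPES" line above):

from comfy.cldm.control_types import UNION_CONTROLNET_TYPES

selected = ["openpose", "depth"]                                     # hypothetical user selection
control_type = [UNION_CONTROLNET_TYPES[name] for name in selected]  # -> [0, 1]

# A union model that only supports 6 control types would reject "tile" (6) or
# "repaint" (7); that is the situation the new ValueError above reports, with a
# pointer to the ProMax ControlNet Union model.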


@@ -6,6 +6,7 @@ class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
     def __init__(
         self,
         num_blocks = None,
+        control_latent_channels = None,
         dtype = None,
         device = None,
         operations = None,
@@ -17,10 +18,13 @@ class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
         for _ in range(len(self.joint_blocks)):
             self.controlnet_blocks.append(operations.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype))

+        if control_latent_channels is None:
+            control_latent_channels = self.in_channels
+
         self.pos_embed_input = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(
             None,
             self.patch_size,
-            self.in_channels,
+            control_latent_channels,
             self.hidden_size,
             bias=True,
             strict_img_size=False,


@@ -1,7 +1,10 @@
 import argparse
 import enum
+import os
+from typing import Optional
 import comfy.options


 class EnumAction(argparse.Action):
     """
     Argparse action for handling Enums
@@ -33,7 +36,7 @@ class EnumAction(argparse.Action):

 parser = argparse.ArgumentParser()

-parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)")
+parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0,::", help="Specify the IP address to listen on (default: 127.0.0.1). You can give a list of ip addresses by separating them with a comma like: 127.2.2.2,127.3.3.3 If --listen is provided without an argument, it defaults to 0.0.0.0,:: (listens on all ipv4 and ipv6)")
 parser.add_argument("--port", type=int, default=8188, help="Set the listen port.")
 parser.add_argument("--tls-keyfile", type=str, help="Path to TLS (SSL) key file. Enables TLS, makes app accessible at https://... requires --tls-certfile to function")
 parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certificate file. Enables TLS, makes app accessible at https://... requires --tls-keyfile to function")
@@ -89,6 +92,12 @@ class LatentPreviewMethod(enum.Enum):

 parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)

+parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
+
+cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
+cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
+
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
@@ -109,9 +118,14 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

+parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.")
+
+parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
+
 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
+parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@@ -122,8 +136,42 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

-parser.add_argument("--verbose", action="store_true", help="Enables more debug prints.")
+parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
+
+# The default built-in provider hosted under web/
+DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
+
+parser.add_argument(
+    "--front-end-version",
+    type=str,
+    default=DEFAULT_VERSION_STRING,
+    help="""
+    Specifies the version of the frontend to be used. This command needs internet connectivity to query and
+    download available frontend implementations from GitHub releases.
+
+    The version string should be in the format of:
+    [repoOwner]/[repoName]@[version]
+    where version is one of: "latest" or a valid version number (e.g. "1.0.0")
+    """,
+)
+
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
+
+    if not os.path.isdir(path):
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
+    return path
+
+parser.add_argument(
+    "--front-end-root",
+    type=is_valid_directory,
+    default=None,
+    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
+)
+
+parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path.")

 if comfy.options.args_parsing:
     args = parser.parse_args()
@@ -135,10 +183,3 @@ if args.windows_standalone_build:

 if args.disable_auto_launch:
     args.auto_launch = False
-
-import logging
-logging_level = logging.INFO
-if args.verbose:
-    logging_level = logging.DEBUG
-
-logging.basicConfig(format="%(message)s", level=logging_level)


@@ -5,7 +5,7 @@
"attention_dropout": 0.0, "attention_dropout": 0.0,
"bos_token_id": 0, "bos_token_id": 0,
"dropout": 0.0, "dropout": 0.0,
"eos_token_id": 2, "eos_token_id": 49407,
"hidden_act": "gelu", "hidden_act": "gelu",
"hidden_size": 1280, "hidden_size": 1280,
"initializer_factor": 1.0, "initializer_factor": 1.0,


@@ -1,5 +1,6 @@
import torch import torch
from comfy.ldm.modules.attention import optimized_attention_for_device from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
class CLIPAttention(torch.nn.Module): class CLIPAttention(torch.nn.Module):
def __init__(self, embed_dim, heads, dtype, device, operations): def __init__(self, embed_dim, heads, dtype, device, operations):
@@ -71,13 +72,13 @@ class CLIPEncoder(torch.nn.Module):
return x, intermediate return x, intermediate
class CLIPEmbeddings(torch.nn.Module): class CLIPEmbeddings(torch.nn.Module):
def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None): def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, operations=None):
super().__init__() super().__init__()
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device) self.token_embedding = operations.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
def forward(self, input_tokens): def forward(self, input_tokens, dtype=torch.float32):
return self.token_embedding(input_tokens) + self.position_embedding.weight return self.token_embedding(input_tokens, out_dtype=dtype) + comfy.ops.cast_to(self.position_embedding.weight, dtype=dtype, device=input_tokens.device)
class CLIPTextModel_(torch.nn.Module): class CLIPTextModel_(torch.nn.Module):
@@ -87,14 +88,16 @@ class CLIPTextModel_(torch.nn.Module):
heads = config_dict["num_attention_heads"] heads = config_dict["num_attention_heads"]
intermediate_size = config_dict["intermediate_size"] intermediate_size = config_dict["intermediate_size"]
intermediate_activation = config_dict["hidden_act"] intermediate_activation = config_dict["hidden_act"]
num_positions = config_dict["max_position_embeddings"]
self.eos_token_id = config_dict["eos_token_id"]
super().__init__() super().__init__()
self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device) self.embeddings = CLIPEmbeddings(embed_dim, num_positions=num_positions, dtype=dtype, device=device, operations=operations)
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device) self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True): def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
x = self.embeddings(input_tokens) x = self.embeddings(input_tokens, dtype=dtype)
mask = None mask = None
if attention_mask is not None: if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -111,7 +114,7 @@ class CLIPTextModel_(torch.nn.Module):
if i is not None and final_layer_norm_intermediate: if i is not None and final_layer_norm_intermediate:
i = self.final_layer_norm(i) i = self.final_layer_norm(i)
pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),] pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
return x, i, pooled_output return x, i, pooled_output
class CLIPTextModel(torch.nn.Module): class CLIPTextModel(torch.nn.Module):
@@ -121,7 +124,6 @@ class CLIPTextModel(torch.nn.Module):
self.text_model = CLIPTextModel_(config_dict, dtype, device, operations) self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
embed_dim = config_dict["hidden_size"] embed_dim = config_dict["hidden_size"]
self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
self.text_projection.weight.copy_(torch.eye(embed_dim))
self.dtype = dtype self.dtype = dtype
def get_input_embeddings(self): def get_input_embeddings(self):
@@ -153,11 +155,11 @@ class CLIPVisionEmbeddings(torch.nn.Module):
num_patches = (image_size // patch_size) ** 2 num_patches = (image_size // patch_size) ** 2
num_positions = num_patches + 1 num_positions = num_patches + 1
self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
def forward(self, pixel_values): def forward(self, pixel_values):
embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2) embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device) return torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)
class CLIPVision(torch.nn.Module): class CLIPVision(torch.nn.Module):
@@ -169,7 +171,7 @@ class CLIPVision(torch.nn.Module):
intermediate_size = config_dict["intermediate_size"] intermediate_size = config_dict["intermediate_size"]
intermediate_activation = config_dict["hidden_act"] intermediate_activation = config_dict["hidden_act"]
self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations) self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=dtype, device=device, operations=operations)
self.pre_layrnorm = operations.LayerNorm(embed_dim) self.pre_layrnorm = operations.LayerNorm(embed_dim)
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.post_layernorm = operations.LayerNorm(embed_dim) self.post_layernorm = operations.LayerNorm(embed_dim)
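
To make the pooled-output change in the CLIP text model above concrete: the EOS position is now located by comparing token ids against eos_token_id (49407 in these CLIP configs) instead of taking the arg-max of the raw ids, which only works while the EOS id happens to be the largest id in the sequence. A tiny sketch with illustrative token ids:

import torch

eos_token_id = 49407
tokens = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])  # BOS, two prompt tokens, EOS, padding (ids illustrative)

eos_pos = (tokens == eos_token_id).int().argmax(dim=-1)   # -> tensor([3]); first EOS per sequence
# pooled_output = hidden_states[torch.arange(tokens.shape[0]), eos_pos]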


@@ -34,6 +34,7 @@ class ClipVisionModel():
         with open(json_config) as f:
             config = json.load(f)

+        self.image_size = config.get("image_size", 224)
         self.load_device = comfy.model_management.text_encoder_device()
         offload_device = comfy.model_management.text_encoder_offload_device()
         self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
@@ -50,7 +51,7 @@ class ClipVisionModel():
     def encode_image(self, image):
         comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device)).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size).float()
         out = self.model(pixel_values=pixel_values, intermediate_output=-2)

         outputs = Output()
@@ -93,7 +94,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
     elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
-        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
+        if sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        else:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
     else:
         return None
@@ -105,8 +109,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     keys = list(sd.keys())
     for k in keys:
         if k not in u:
-            t = sd.pop(k)
-            del t
+            sd.pop(k)

     return clip

 def load(ckpt_path):


@@ -0,0 +1,18 @@
{
"attention_dropout": 0.0,
"dropout": 0.0,
"hidden_act": "quick_gelu",
"hidden_size": 1024,
"image_size": 336,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-5,
"model_type": "clip_vision_model",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"projection_dim": 768,
"torch_dtype": "float32"
}


@@ -1,4 +1,24 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch import torch
from enum import Enum
import math import math
import os import os
import logging import logging
@@ -13,6 +33,8 @@ import comfy.cldm.cldm
import comfy.t2i_adapter.adapter import comfy.t2i_adapter.adapter
import comfy.ldm.cascade.controlnet import comfy.ldm.cascade.controlnet
import comfy.cldm.mmdit import comfy.cldm.mmdit
import comfy.ldm.hydit.controlnet
import comfy.ldm.flux.controlnet
def broadcast_image_to(tensor, target_batch_size, batched_number): def broadcast_image_to(tensor, target_batch_size, batched_number):
@@ -33,6 +55,10 @@ def broadcast_image_to(tensor, target_batch_size, batched_number):
else: else:
return torch.cat([tensor] * batched_number, dim=0) return torch.cat([tensor] * batched_number, dim=0)
class StrengthType(Enum):
CONSTANT = 1
LINEAR_UP = 2
class ControlBase: class ControlBase:
def __init__(self, device=None): def __init__(self, device=None):
self.cond_hint_original = None self.cond_hint_original = None
@@ -45,18 +71,29 @@ class ControlBase:
self.timestep_range = None self.timestep_range = None
self.compression_ratio = 8 self.compression_ratio = 8
self.upscale_algorithm = 'nearest-exact' self.upscale_algorithm = 'nearest-exact'
self.extra_args = {}
if device is None: if device is None:
device = comfy.model_management.get_torch_device() device = comfy.model_management.get_torch_device()
self.device = device self.device = device
self.previous_controlnet = None self.previous_controlnet = None
self.extra_conds = []
self.strength_type = StrengthType.CONSTANT
self.concat_mask = False
self.extra_concat_orig = []
self.extra_concat = None
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None): def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
self.cond_hint_original = cond_hint self.cond_hint_original = cond_hint
self.strength = strength self.strength = strength
self.timestep_percent_range = timestep_percent_range self.timestep_percent_range = timestep_percent_range
if self.latent_format is not None: if self.latent_format is not None:
if vae is None:
logging.warning("WARNING: no VAE provided to the controlnet apply node when this controlnet requires one.")
self.vae = vae self.vae = vae
self.extra_concat_orig = extra_concat.copy()
if self.concat_mask and len(self.extra_concat_orig) == 0:
self.extra_concat_orig.append(torch.tensor([[[[1.0]]]]))
return self return self
def pre_run(self, model, percent_to_timestep_function): def pre_run(self, model, percent_to_timestep_function):
@@ -71,9 +108,9 @@ class ControlBase:
def cleanup(self): def cleanup(self):
if self.previous_controlnet is not None: if self.previous_controlnet is not None:
self.previous_controlnet.cleanup() self.previous_controlnet.cleanup()
if self.cond_hint is not None:
del self.cond_hint self.cond_hint = None
self.cond_hint = None self.extra_concat = None
self.timestep_range = None self.timestep_range = None
def get_models(self): def get_models(self):
@@ -90,7 +127,12 @@ class ControlBase:
c.compression_ratio = self.compression_ratio c.compression_ratio = self.compression_ratio
c.upscale_algorithm = self.upscale_algorithm c.upscale_algorithm = self.upscale_algorithm
c.latent_format = self.latent_format c.latent_format = self.latent_format
c.extra_args = self.extra_args.copy()
c.vae = self.vae c.vae = self.vae
c.extra_conds = self.extra_conds.copy()
c.strength_type = self.strength_type
c.concat_mask = self.concat_mask
c.extra_concat_orig = self.extra_concat_orig.copy()
def inference_memory_requirements(self, dtype): def inference_memory_requirements(self, dtype):
if self.previous_controlnet is not None: if self.previous_controlnet is not None:
@@ -111,9 +153,12 @@ class ControlBase:
if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once
applied_to.add(x) applied_to.add(x)
x *= self.strength if self.strength_type == StrengthType.CONSTANT:
x *= self.strength
elif self.strength_type == StrengthType.LINEAR_UP:
x *= (self.strength ** float(len(control_output) - i))
if x.dtype != output_dtype: if output_dtype is not None and x.dtype != output_dtype:
x = x.to(output_dtype) x = x.to(output_dtype)
out[key].append(x) out[key].append(x)
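
A worked restatement of the two strength modes above (plain arithmetic, not the actual ComfyUI call path): CONSTANT multiplies every control output by strength, while LINEAR_UP raises it to a power that shrinks toward 1 for the last outputs, so earlier feature maps are damped more when strength < 1.

strength = 0.8
num_outputs = 3  # stand-in for len(control_output)

constant_scales = [strength for _ in range(num_outputs)]
linear_up_scales = [strength ** float(num_outputs - i) for i in range(num_outputs)]
# constant_scales  == [0.8, 0.8, 0.8]
# linear_up_scales == [0.512..., 0.64..., 0.8]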
@@ -135,8 +180,12 @@ class ControlBase:
o[i] = prev_val + o[i] #TODO: change back to inplace add if shared tensors stop being an issue o[i] = prev_val + o[i] #TODO: change back to inplace add if shared tensors stop being an issue
return out return out
def set_extra_arg(self, argument, value=None):
self.extra_args[argument] = value
class ControlNet(ControlBase): class ControlNet(ControlBase):
def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None): def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False):
super().__init__(device) super().__init__(device)
self.control_model = control_model self.control_model = control_model
self.load_device = load_device self.load_device = load_device
@@ -148,6 +197,9 @@ class ControlNet(ControlBase):
self.model_sampling_current = None self.model_sampling_current = None
self.manual_cast_dtype = manual_cast_dtype self.manual_cast_dtype = manual_cast_dtype
self.latent_format = latent_format self.latent_format = latent_format
self.extra_conds += extra_conds
self.strength_type = strength_type
self.concat_mask = concat_mask
def get_control(self, x_noisy, t, cond, batched_number): def get_control(self, x_noisy, t, cond, batched_number):
control_prev = None control_prev = None
@@ -165,7 +217,6 @@ class ControlNet(ControlBase):
if self.manual_cast_dtype is not None: if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype dtype = self.manual_cast_dtype
output_dtype = x_noisy.dtype
if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]: if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
if self.cond_hint is not None: if self.cond_hint is not None:
del self.cond_hint del self.cond_hint
@@ -173,6 +224,9 @@ class ControlNet(ControlBase):
compression_ratio = self.compression_ratio compression_ratio = self.compression_ratio
if self.vae is not None: if self.vae is not None:
compression_ratio *= self.vae.downscale_ratio compression_ratio *= self.vae.downscale_ratio
else:
if self.latent_format is not None:
raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center") self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
if self.vae is not None: if self.vae is not None:
loaded_models = comfy.model_management.loaded_models(only_currently_used=True) loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
@@ -180,19 +234,30 @@ class ControlNet(ControlBase):
comfy.model_management.load_models_gpu(loaded_models) comfy.model_management.load_models_gpu(loaded_models)
if self.latent_format is not None: if self.latent_format is not None:
self.cond_hint = self.latent_format.process_in(self.cond_hint) self.cond_hint = self.latent_format.process_in(self.cond_hint)
if len(self.extra_concat_orig) > 0:
to_concat = []
for c in self.extra_concat_orig:
c = c.to(self.cond_hint.device)
c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype) self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype)
if x_noisy.shape[0] != self.cond_hint.shape[0]: if x_noisy.shape[0] != self.cond_hint.shape[0]:
self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number) self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
context = cond.get('crossattn_controlnet', cond['c_crossattn']) context = cond.get('crossattn_controlnet', cond['c_crossattn'])
y = cond.get('y', None) extra = self.extra_args.copy()
if y is not None: for c in self.extra_conds:
y = y.to(dtype) temp = cond.get(c, None)
if temp is not None:
extra[c] = temp.to(dtype)
timestep = self.model_sampling_current.timestep(t) timestep = self.model_sampling_current.timestep(t)
x_noisy = self.model_sampling_current.calculate_input(t, x_noisy) x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)
control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y) control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra)
return self.control_merge(control, control_prev, output_dtype) return self.control_merge(control, control_prev, output_dtype=None)
def copy(self): def copy(self):
c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype) c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
@@ -276,10 +341,11 @@ class ControlLoraOps:
class ControlLora(ControlNet): class ControlLora(ControlNet):
def __init__(self, control_weights, global_average_pooling=False, device=None): def __init__(self, control_weights, global_average_pooling=False, device=None, model_options={}): #TODO? model_options
ControlBase.__init__(self, device) ControlBase.__init__(self, device)
self.control_weights = control_weights self.control_weights = control_weights
self.global_average_pooling = global_average_pooling self.global_average_pooling = global_average_pooling
self.extra_conds += ["y"]
def pre_run(self, model, percent_to_timestep_function): def pre_run(self, model, percent_to_timestep_function):
super().pre_run(model, percent_to_timestep_function) super().pre_run(model, percent_to_timestep_function)
@@ -332,43 +398,114 @@ class ControlLora(ControlNet):
def inference_memory_requirements(self, dtype): def inference_memory_requirements(self, dtype):
return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype) return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)
def load_controlnet_mmdit(sd): def controlnet_config(sd, model_options={}):
new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "") model_config = comfy.model_detection.model_config_from_unet(sd, "", True)
model_config = comfy.model_detection.model_config_from_unet(new_sd, "", True)
num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
for k in sd:
new_sd[k] = sd[k]
supported_inference_dtypes = model_config.supported_inference_dtypes unet_dtype = model_options.get("dtype", None)
if unet_dtype is None:
weight_dtype = comfy.utils.weight_dtype(sd)
supported_inference_dtypes = list(model_config.supported_inference_dtypes)
if weight_dtype is not None:
supported_inference_dtypes.append(weight_dtype)
unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)
controlnet_config = model_config.unet_config
unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
load_device = comfy.model_management.get_torch_device() load_device = comfy.model_management.get_torch_device()
manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device) manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
if manual_cast_dtype is not None:
operations = comfy.ops.manual_cast
else:
operations = comfy.ops.disable_weight_init
control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **controlnet_config) operations = model_options.get("custom_operations", None)
missing, unexpected = control_model.load_state_dict(new_sd, strict=False) if operations is None:
operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
offload_device = comfy.model_management.unet_offload_device()
return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
def controlnet_load_state_dict(control_model, sd):
missing, unexpected = control_model.load_state_dict(sd, strict=False)
if len(missing) > 0: if len(missing) > 0:
logging.warning("missing controlnet keys: {}".format(missing)) logging.warning("missing controlnet keys: {}".format(missing))
if len(unexpected) > 0: if len(unexpected) > 0:
logging.debug("unexpected controlnet keys: {}".format(unexpected)) logging.debug("unexpected controlnet keys: {}".format(unexpected))
return control_model
def load_controlnet_mmdit(sd, model_options={}):
new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
for k in sd:
new_sd[k] = sd[k]
concat_mask = False
control_latent_channels = new_sd.get("pos_embed_input.proj.weight").shape[1]
if control_latent_channels == 17: #inpaint controlnet
concat_mask = True
control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
control_model = controlnet_load_state_dict(control_model, new_sd)
latent_format = comfy.latent_formats.SD3() latent_format = comfy.latent_formats.SD3()
latent_format.shift_factor = 0 #SD3 controlnet weirdness latent_format.shift_factor = 0 #SD3 controlnet weirdness
control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype) control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
return control return control
def load_controlnet(ckpt_path, model=None): def load_controlnet_hunyuandit(controlnet_data, model_options={}):
controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True) model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data, model_options=model_options)
control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
control_model = controlnet_load_state_dict(control_model, controlnet_data)
latent_format = comfy.latent_formats.SDXL()
extra_conds = ['text_embedding_mask', 'encoder_hidden_states_t5', 'text_embedding_mask_t5', 'image_meta_size', 'style', 'cos_cis_img', 'sin_cis_img']
control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds, strength_type=StrengthType.CONSTANT)
return control
def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
control_model = controlnet_load_state_dict(control_model, sd)
extra_conds = ['y', 'guidance']
control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
return control
def load_controlnet_flux_instantx(sd, model_options={}):
new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
for k in sd:
new_sd[k] = sd[k]
num_union_modes = 0
union_cnet = "controlnet_mode_embedder.weight"
if union_cnet in new_sd:
num_union_modes = new_sd[union_cnet].shape[0]
control_latent_channels = new_sd.get("pos_embed_input.weight").shape[1] // 4
concat_mask = False
if control_latent_channels == 17:
concat_mask = True
control_model = comfy.ldm.flux.controlnet.ControlNetFlux(latent_input=True, num_union_modes=num_union_modes, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
control_model = controlnet_load_state_dict(control_model, new_sd)
latent_format = comfy.latent_formats.Flux()
extra_conds = ['y', 'guidance']
control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
return control
def convert_mistoline(sd):
return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
def load_controlnet_state_dict(state_dict, model=None, model_options={}):
controlnet_data = state_dict
if 'after_proj_list.18.bias' in controlnet_data.keys(): #Hunyuan DiT
return load_controlnet_hunyuandit(controlnet_data, model_options=model_options)
if "lora_controlnet" in controlnet_data: if "lora_controlnet" in controlnet_data:
return ControlLora(controlnet_data) return ControlLora(controlnet_data, model_options=model_options)
controlnet_config = None controlnet_config = None
supported_inference_dtypes = None supported_inference_dtypes = None
@@ -414,7 +551,7 @@ def load_controlnet(ckpt_path, model=None):
new_sd[diffusers_keys[k]] = controlnet_data.pop(k) new_sd[diffusers_keys[k]] = controlnet_data.pop(k)
if "control_add_embedding.linear_1.bias" in controlnet_data: #Union Controlnet if "control_add_embedding.linear_1.bias" in controlnet_data: #Union Controlnet
controlnet_config["union_controlnet"] = True controlnet_config["union_controlnet_num_control_type"] = controlnet_data["task_embedding"].shape[0]
for k in list(controlnet_data.keys()): for k in list(controlnet_data.keys()):
new_k = k.replace('.attn.in_proj_', '.attn.in_proj.') new_k = k.replace('.attn.in_proj_', '.attn.in_proj.')
new_sd[new_k] = controlnet_data.pop(k) new_sd[new_k] = controlnet_data.pop(k)
@@ -423,8 +560,15 @@ def load_controlnet(ckpt_path, model=None):
if len(leftover_keys) > 0: if len(leftover_keys) > 0:
logging.warning("leftover keys: {}".format(leftover_keys)) logging.warning("leftover keys: {}".format(leftover_keys))
controlnet_data = new_sd controlnet_data = new_sd
elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format elif "controlnet_blocks.0.weight" in controlnet_data:
return load_controlnet_mmdit(controlnet_data) if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
return load_controlnet_flux_xlabs_mistoline(controlnet_data, model_options=model_options)
elif "pos_embed_input.proj.weight" in controlnet_data:
return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
elif "controlnet_x_embedder.weight" in controlnet_data:
return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
pth_key = 'control_model.zero_convs.0.0.weight' pth_key = 'control_model.zero_convs.0.0.weight'
pth = False pth = False
@@ -436,26 +580,38 @@ def load_controlnet(ckpt_path, model=None):
elif key in controlnet_data: elif key in controlnet_data:
prefix = "" prefix = ""
else: else:
net = load_t2i_adapter(controlnet_data) net = load_t2i_adapter(controlnet_data, model_options=model_options)
if net is None: if net is None:
logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path)) logging.error("error could not detect control model type.")
return net return net
if controlnet_config is None: if controlnet_config is None:
model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True) model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
supported_inference_dtypes = model_config.supported_inference_dtypes supported_inference_dtypes = list(model_config.supported_inference_dtypes)
controlnet_config = model_config.unet_config controlnet_config = model_config.unet_config
unet_dtype = model_options.get("dtype", None)
if unet_dtype is None:
weight_dtype = comfy.utils.weight_dtype(controlnet_data)
if supported_inference_dtypes is None:
supported_inference_dtypes = [comfy.model_management.unet_dtype()]
if weight_dtype is not None:
supported_inference_dtypes.append(weight_dtype)
unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)
load_device = comfy.model_management.get_torch_device() load_device = comfy.model_management.get_torch_device()
if supported_inference_dtypes is None:
unet_dtype = comfy.model_management.unet_dtype()
else:
unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device) manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
if manual_cast_dtype is not None: operations = model_options.get("custom_operations", None)
controlnet_config["operations"] = comfy.ops.manual_cast if operations is None:
operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype)
controlnet_config["operations"] = operations
controlnet_config["dtype"] = unet_dtype controlnet_config["dtype"] = unet_dtype
controlnet_config["device"] = comfy.model_management.unet_offload_device()
controlnet_config.pop("out_channels") controlnet_config.pop("out_channels")
controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1] controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
control_model = comfy.cldm.cldm.ControlNet(**controlnet_config) control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
@@ -489,14 +645,21 @@ def load_controlnet(ckpt_path, model=None):
if len(unexpected) > 0: if len(unexpected) > 0:
logging.debug("unexpected controlnet keys: {}".format(unexpected)) logging.debug("unexpected controlnet keys: {}".format(unexpected))
global_average_pooling = False global_average_pooling = model_options.get("global_average_pooling", False)
filename = os.path.splitext(ckpt_path)[0]
if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
global_average_pooling = True
control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype) control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
return control return control
def load_controlnet(ckpt_path, model=None, model_options={}):
if "global_average_pooling" not in model_options:
filename = os.path.splitext(ckpt_path)[0]
if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
model_options["global_average_pooling"] = True
cnet = load_controlnet_state_dict(comfy.utils.load_torch_file(ckpt_path, safe_load=True), model=model, model_options=model_options)
if cnet is None:
logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
return cnet
class T2IAdapter(ControlBase): class T2IAdapter(ControlBase):
def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None): def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
super().__init__(device) super().__init__(device)
@@ -552,7 +715,7 @@ class T2IAdapter(ControlBase):
self.copy_to(c) self.copy_to(c)
return c return c
def load_t2i_adapter(t2i_data): def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
compression_ratio = 8 compression_ratio = 8
upscale_algorithm = 'nearest-exact' upscale_algorithm = 'nearest-exact'


@@ -22,7 +22,7 @@ def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_dire
    if text_encoder2_path is not None:
        text_encoder_paths.append(text_encoder2_path)

    unet = comfy.sd.load_diffusion_model(unet_path)

    clip = None
    if output_clip:

comfy/float.py Normal file

@@ -0,0 +1,67 @@
import torch

def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
    mantissa_scaled = torch.where(
        normal_mask,
        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
    )

    mantissa_scaled += torch.rand(mantissa_scaled.size(), dtype=mantissa_scaled.dtype, layout=mantissa_scaled.layout, device=mantissa_scaled.device, generator=generator)
    return mantissa_scaled.floor() / (2**MANTISSA_BITS)

#Not 100% sure about this
def manual_stochastic_round_to_float8(x, dtype, generator=None):
    if dtype == torch.float8_e4m3fn:
        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
    elif dtype == torch.float8_e5m2:
        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
    else:
        raise ValueError("Unsupported dtype")

    x = x.half()
    sign = torch.sign(x)
    abs_x = x.abs()
    sign = torch.where(abs_x == 0, 0, sign)

    # Combine exponent calculation and clamping
    exponent = torch.clamp(
        torch.floor(torch.log2(abs_x)) + EXPONENT_BIAS,
        0, 2**EXPONENT_BITS - 1
    )

    # Combine mantissa calculation and rounding
    normal_mask = ~(exponent == 0)

    abs_x[:] = calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=generator)

    sign *= torch.where(
        normal_mask,
        (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + abs_x),
        (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
    )

    inf = torch.finfo(dtype)
    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
    return sign

def stochastic_rounding(value, dtype, seed=0):
    if dtype == torch.float32:
        return value.to(dtype=torch.float32)
    if dtype == torch.float16:
        return value.to(dtype=torch.float16)
    if dtype == torch.bfloat16:
        return value.to(dtype=torch.bfloat16)
    if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
        generator = torch.Generator(device=value.device)
        generator.manual_seed(seed)
        output = torch.empty_like(value, dtype=dtype)
        num_slices = max(1, (value.numel() / (4096 * 4096)))
        slice_size = max(1, round(value.shape[0] / num_slices))
        for i in range(0, value.shape[0], slice_size):
            output[i:i+slice_size].copy_(manual_stochastic_round_to_float8(value[i:i+slice_size], dtype, generator=generator))
        return output

    return value.to(dtype=dtype)
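The helper above rounds each slice stochastically so that quantization error is dithered instead of always rounding to nearest. A minimal usage sketch, assuming a PyTorch build that exposes the float8 dtypes (tensor name and shape are illustrative only):

# --- illustrative example, not part of the diff ---
import torch
import comfy.float

weight = torch.randn(128, 256, dtype=torch.float32)
rounded = comfy.float.stochastic_rounding(weight, torch.float8_e4m3fn, seed=42)
print(rounded.dtype)  # torch.float8_e4m3fn
# A different seed generally yields a slightly different rounding pattern.
rounded_other = comfy.float.stochastic_rounding(weight, torch.float8_e4m3fn, seed=7)
print((rounded.float() - rounded_other.float()).abs().max())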


@@ -9,6 +9,7 @@ from tqdm.auto import trange, tqdm
from . import utils
from . import deis
import comfy.model_patcher
import comfy.model_sampling

def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])
@@ -43,6 +44,17 @@ def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    return append_zero(sigmas)

def get_sigmas_laplace(n, sigma_min, sigma_max, mu=0., beta=0.5, device='cpu'):
    """Constructs the noise schedule proposed by Tiankai et al. (2024)."""
    epsilon = 1e-5 # avoid log(0)
    x = torch.linspace(0, 1, n, device=device)
    clamp = lambda x: torch.clamp(x, min=sigma_min, max=sigma_max)
    lmb = mu - beta * torch.sign(0.5-x) * torch.log(1 - 2 * torch.abs(0.5-x) + epsilon)
    sigmas = clamp(torch.exp(lmb))
    return sigmas
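A quick standalone sketch of calling the Laplace schedule above; the sigma_min/sigma_max values are typical SD numbers chosen only for illustration:

# --- illustrative example, not part of the diff ---
import torch
from comfy.k_diffusion.sampling import get_sigmas_laplace

sigmas = get_sigmas_laplace(10, sigma_min=0.0292, sigma_max=14.61)
print(sigmas)  # tensor of 10 sigmas clamped to [sigma_min, sigma_max]; note no trailing zero is appended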
def to_d(x, sigma, denoised):
    """Converts a denoiser output to a Karras ODE derivative."""
    return (x - denoised) / utils.append_dims(sigma, x.ndim)
@@ -509,6 +521,9 @@ def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callbac
@torch.no_grad()
def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
        return sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)

    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
@@ -541,6 +556,55 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
    return x
@torch.no_grad()
def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
"""Ancestral sampling with DPM-Solver++(2S) second-order steps."""
extra_args = {} if extra_args is None else extra_args
noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
# logged_x = x.unsqueeze(0)
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
sigma_down = sigmas[i+1] * downstep_ratio
alpha_ip1 = 1 - sigmas[i+1]
alpha_down = 1 - sigma_down
renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
# sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
if sigmas[i + 1] == 0:
# Euler method
d = to_d(x, sigmas[i], denoised)
dt = sigma_down - sigmas[i]
x = x + d * dt
else:
# DPM-Solver++(2S)
if sigmas[i] == 1.0:
sigma_s = 0.9999
else:
t_i, t_down = lambda_fn(sigmas[i]), lambda_fn(sigma_down)
r = 1 / 2
h = t_down - t_i
s = t_i + r * h
sigma_s = sigma_fn(s)
# sigma_s = sigmas[i+1]
sigma_s_i_ratio = sigma_s / sigmas[i]
u = sigma_s_i_ratio * x + (1 - sigma_s_i_ratio) * denoised
D_i = model(u, sigma_s * s_in, **extra_args)
sigma_down_i_ratio = sigma_down / sigmas[i]
x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * D_i
# print("sigma_i", sigmas[i], "sigma_ip1", sigmas[i+1],"sigma_down", sigma_down, "sigma_down_i_ratio", sigma_down_i_ratio, "sigma_s_i_ratio", sigma_s_i_ratio, "renoise_coeff", renoise_coeff)
# Noise addition
if sigmas[i + 1] > 0 and eta > 0:
x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
# logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
return x
@torch.no_grad()
def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
@@ -1016,7 +1080,6 @@ def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disabl
        d = to_d(x, sigma_hat, temp[0])
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
        # Euler method
        x = denoised + d * sigmas[i + 1]
    return x
@@ -1043,8 +1106,81 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        d = to_d(x, sigmas[i], temp[0])
        # Euler method
        x = denoised + d * sigma_down
        if sigmas[i + 1] > 0:
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x
@torch.no_grad()
def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
"""Ancestral sampling with DPM-Solver++(2S) second-order steps."""
extra_args = {} if extra_args is None else extra_args
noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
temp = [0]
def post_cfg_function(args):
temp[0] = args["uncond_denoised"]
return args["denoised"]
model_options = extra_args.get("model_options", {}).copy()
extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda t: t.neg().exp()
t_fn = lambda sigma: sigma.log().neg()
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
if sigma_down == 0:
# Euler method
d = to_d(x, sigmas[i], temp[0])
x = denoised + d * sigma_down
else:
# DPM-Solver++(2S)
t, t_next = t_fn(sigmas[i]), t_fn(sigma_down)
# r = torch.sinh(1 + (2 - eta) * (t_next - t) / (t - t_fn(sigma_up))) works only on non-cfgpp, weird
r = 1 / 2
h = t_next - t
s = t + r * h
x_2 = (sigma_fn(s) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h * r).expm1() * denoised
denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
x = (sigma_fn(t_next) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h).expm1() * denoised_2
# Noise addition
if sigmas[i + 1] > 0:
x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
return x
@torch.no_grad()
def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
"""DPM-Solver++(2M)."""
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]])
t_fn = lambda sigma: sigma.log().neg()
old_uncond_denoised = None
uncond_denoised = None
def post_cfg_function(args):
nonlocal uncond_denoised
uncond_denoised = args["uncond_denoised"]
return args["denoised"]
model_options = extra_args.get("model_options", {}).copy()
extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
h = t_next - t
if old_uncond_denoised is None or sigmas[i + 1] == 0:
denoised_mix = -torch.exp(-h) * uncond_denoised
else:
h_last = t - t_fn(sigmas[i - 1])
r = h_last / h
denoised_mix = -torch.exp(-h) * uncond_denoised - torch.expm1(-h) * (1 / (2 * r)) * (denoised - old_uncond_denoised)
x = denoised + denoised_mix + torch.exp(-h) * x
old_uncond_denoised = uncond_denoised
return x
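Both CFG++ samplers above capture the unconditional prediction through the same post-CFG hook. A minimal sketch of that hook in isolation, assuming the model_options dict is the one normally carried in extra_args:

# --- illustrative example, not part of the diff ---
import comfy.model_patcher

captured = {"uncond": None}

def post_cfg_function(args):
    # args carries "denoised" and "uncond_denoised" after CFG has been applied
    captured["uncond"] = args["uncond_denoised"]
    return args["denoised"]

model_options = {}  # normally extra_args.get("model_options", {}).copy()
model_options = comfy.model_patcher.set_model_options_post_cfg_function(
    model_options, post_cfg_function, disable_cfg1_optimization=True)
# Passing these model_options back through extra_args lets the sampler read
# captured["uncond"] after every model call, which is what temp[0] / uncond_denoised do above.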


@@ -4,6 +4,7 @@ class LatentFormat:
    scale_factor = 1.0
    latent_channels = 4
    latent_rgb_factors = None
    latent_rgb_factors_bias = None
    taesd_decoder_name = None

    def process_in(self, latent):
@@ -30,11 +31,13 @@ class SDXL(LatentFormat):
    def __init__(self):
        self.latent_rgb_factors = [
            #   R        G        B
            [ 0.3651,  0.4232,  0.4341],
            [-0.2533, -0.0042,  0.1068],
            [ 0.1076,  0.1111, -0.0362],
            [-0.3165, -0.2492, -0.2188]
        ]
        self.latent_rgb_factors_bias = [ 0.1084, -0.0175, -0.0011]
        self.taesd_decoder_name = "taesdxl_decoder"

class SDXL_Playground_2_5(LatentFormat):
@@ -112,23 +115,24 @@ class SD3(LatentFormat):
        self.scale_factor = 1.5305
        self.shift_factor = 0.0609
        self.latent_rgb_factors = [
            [-0.0922, -0.0175,  0.0749],
            [ 0.0311,  0.0633,  0.0954],
            [ 0.1994,  0.0927,  0.0458],
            [ 0.0856,  0.0339,  0.0902],
            [ 0.0587,  0.0272, -0.0496],
            [-0.0006,  0.1104,  0.0309],
            [ 0.0978,  0.0306,  0.0427],
            [-0.0042,  0.1038,  0.1358],
            [-0.0194,  0.0020,  0.0669],
            [-0.0488,  0.0130, -0.0268],
            [ 0.0922,  0.0988,  0.0951],
            [-0.0278,  0.0524, -0.0542],
            [ 0.0332,  0.0456,  0.0895],
            [-0.0069, -0.0030, -0.0810],
            [-0.0596, -0.0465, -0.0293],
            [-0.1448, -0.1463, -0.1189]
        ]
        self.latent_rgb_factors_bias = [0.2394, 0.2135, 0.1925]
        self.taesd_decoder_name = "taesd3_decoder"

    def process_in(self, latent):

@@ -139,3 +143,35 @@ class SD3(LatentFormat):
class StableAudio1(LatentFormat):
    latent_channels = 64
class Flux(SD3):
latent_channels = 16
def __init__(self):
self.scale_factor = 0.3611
self.shift_factor = 0.1159
self.latent_rgb_factors =[
[-0.0346, 0.0244, 0.0681],
[ 0.0034, 0.0210, 0.0687],
[ 0.0275, -0.0668, -0.0433],
[-0.0174, 0.0160, 0.0617],
[ 0.0859, 0.0721, 0.0329],
[ 0.0004, 0.0383, 0.0115],
[ 0.0405, 0.0861, 0.0915],
[-0.0236, -0.0185, -0.0259],
[-0.0245, 0.0250, 0.1180],
[ 0.1008, 0.0755, -0.0421],
[-0.0515, 0.0201, 0.0011],
[ 0.0428, -0.0012, -0.0036],
[ 0.0817, 0.0765, 0.0749],
[-0.1264, -0.0522, -0.1103],
[-0.0280, -0.0881, -0.0499],
[-0.1262, -0.0982, -0.0778]
]
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
self.taesd_decoder_name = "taef1_decoder"
def process_in(self, latent):
return (latent - self.shift_factor) * self.scale_factor
def process_out(self, latent):
return (latent / self.scale_factor) + self.shift_factor
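The Flux latent format above is a plain shift-and-scale; a small round-trip check with an arbitrary tensor shape:

# --- illustrative example, not part of the diff ---
import torch
import comfy.latent_formats

fmt = comfy.latent_formats.Flux()
latent = torch.randn(1, 16, 64, 64)              # 16-channel Flux latent
inside = fmt.process_in(latent)                  # (latent - 0.1159) * 0.3611
back = fmt.process_out(inside)                   # inverse mapping
print(torch.allclose(back, latent, atol=1e-5))   # True up to float rounding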


@@ -9,6 +9,7 @@ from einops import rearrange
from torch import nn
from torch.nn import functional as F
import math
import comfy.ops

class FourierFeatures(nn.Module):
    def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -18,7 +19,7 @@ class FourierFeatures(nn.Module):
            [out_features // 2, in_features], dtype=dtype, device=device))

    def forward(self, input):
        f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
        return torch.cat([f.cos(), f.sin()], dim=-1)
# norms
@@ -38,9 +39,9 @@ class LayerNorm(nn.Module):
    def forward(self, x):
        beta = self.beta
        if beta is not None:
            beta = comfy.ops.cast_to_input(beta, x)
        return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)

class GLU(nn.Module):
    def __init__(
@@ -123,7 +124,9 @@ class RotaryEmbedding(nn.Module):
        scale_base = 512,
        interpolation_factor = 1.,
        base = 10000,
        base_rescale_factor = 1.,
        dtype=None,
        device=None,
    ):
        super().__init__()
        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
@@ -131,8 +134,8 @@ class RotaryEmbedding(nn.Module):
        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
        base *= base_rescale_factor ** (dim / (dim - 2))

        # inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', torch.empty((dim // 2,), device=device, dtype=dtype))

        assert interpolation_factor >= 1.
        self.interpolation_factor = interpolation_factor
@@ -161,14 +164,14 @@ class RotaryEmbedding(nn.Module):
        t = t / self.interpolation_factor

        freqs = torch.einsum('i , j -> i j', t, comfy.ops.cast_to_input(self.inv_freq, t))
        freqs = torch.cat((freqs, freqs), dim = -1)

        if self.scale is None:
            return freqs, 1.

        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
        scale = torch.cat((scale, scale), dim = -1)

        return freqs, scale
@@ -568,7 +571,7 @@ class ContinuousTransformer(nn.Module):
        self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()

        if rotary_pos_emb:
            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32), device=device, dtype=dtype)
        else:
            self.rotary_pos_emb = None


@@ -8,6 +8,8 @@ import torch.nn as nn
import torch.nn.functional as F

from comfy.ldm.modules.attention import optimized_attention
import comfy.ops
import comfy.ldm.common_dit

def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -406,10 +408,7 @@ class MMDiT(nn.Module):
    def patchify(self, x):
        B, C, H, W = x.size()

        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))

        x = x.view(
            B,
            C,
@@ -427,7 +426,7 @@ class MMDiT(nn.Module):
        max_dim = max(h, w)

        cur_dim = self.h_max
        pos_encoding = comfy.ops.cast_to_input(self.positional_encoding.reshape(1, cur_dim, cur_dim, -1), x)

        if max_dim > cur_dim:
            pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1)
@@ -455,7 +454,7 @@ class MMDiT(nn.Module):
        t = timestep

        c = self.cond_seq_linear(c_seq)  # B, T_c, D
        c = torch.cat([comfy.ops.cast_to_input(self.register_tokens, c).repeat(c.size(0), 1, 1), c], dim=1)

        global_cond = self.t_embedder(t, x.dtype)  # B, D


@@ -19,14 +19,7 @@
import torch
import torch.nn as nn
from comfy.ldm.modules.attention import optimized_attention
import comfy.ops

class OptimizedAttention(nn.Module):
    def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
@@ -78,13 +71,13 @@ class GlobalResponseNorm(nn.Module):
    "from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105"
    def __init__(self, dim, dtype=None, device=None):
        super().__init__()
        self.gamma = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))
        self.beta = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))

    def forward(self, x):
        Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
        return comfy.ops.cast_to_input(self.gamma, x) * (x * Nx) + comfy.ops.cast_to_input(self.beta, x) + x

class ResBlock(nn.Module):

comfy/ldm/common_dit.py Normal file

@@ -0,0 +1,21 @@
import torch
import comfy.ops

def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
    if padding_mode == "circular" and torch.jit.is_tracing() or torch.jit.is_scripting():
        padding_mode = "reflect"
    pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
    pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
    return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)

try:
    rms_norm_torch = torch.nn.functional.rms_norm
except:
    rms_norm_torch = None

def rms_norm(x, weight, eps=1e-6):
    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
        return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
    else:
        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
        return (x * rrms) * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
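A small sanity-check sketch for the two helpers above; shapes are arbitrary, and rms_norm silently falls back to the manual branch on PyTorch builds without torch.nn.functional.rms_norm:

# --- illustrative example, not part of the diff ---
import torch
import comfy.ldm.common_dit as common_dit

x = torch.randn(1, 16, 31, 45)                        # H and W not divisible by the patch size
print(common_dit.pad_to_patch_size(x, (2, 2)).shape)  # torch.Size([1, 16, 32, 46])

feat = torch.randn(2, 10, 64)
weight = torch.ones(64)
print(common_dit.rms_norm(feat, weight).shape)        # same shape, RMS-normalized over the last dim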


@@ -0,0 +1,205 @@
#Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
#modified to support different types of flux controlnets
import torch
import math
from torch import Tensor, nn
from einops import rearrange, repeat
from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
MLPEmbedder, SingleStreamBlock,
timestep_embedding)
from .model import Flux
import comfy.ldm.common_dit
class MistolineCondDownsamplBlock(nn.Module):
def __init__(self, dtype=None, device=None, operations=None):
super().__init__()
self.encoder = nn.Sequential(
operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
)
def forward(self, x):
return self.encoder(x)
class MistolineControlnetBlock(nn.Module):
def __init__(self, hidden_size, dtype=None, device=None, operations=None):
super().__init__()
self.linear = operations.Linear(hidden_size, hidden_size, dtype=dtype, device=device)
self.act = nn.SiLU()
def forward(self, x):
return self.act(self.linear(x))
class ControlNetFlux(Flux):
def __init__(self, latent_input=False, num_union_modes=0, mistoline=False, control_latent_channels=None, image_model=None, dtype=None, device=None, operations=None, **kwargs):
super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)
self.main_model_double = 19
self.main_model_single = 38
self.mistoline = mistoline
# add ControlNet blocks
if self.mistoline:
control_block = lambda : MistolineControlnetBlock(self.hidden_size, dtype=dtype, device=device, operations=operations)
else:
control_block = lambda : operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
self.controlnet_blocks = nn.ModuleList([])
for _ in range(self.params.depth):
self.controlnet_blocks.append(control_block())
self.controlnet_single_blocks = nn.ModuleList([])
for _ in range(self.params.depth_single_blocks):
self.controlnet_single_blocks.append(control_block())
self.num_union_modes = num_union_modes
self.controlnet_mode_embedder = None
if self.num_union_modes > 0:
self.controlnet_mode_embedder = operations.Embedding(self.num_union_modes, self.hidden_size, dtype=dtype, device=device)
self.gradient_checkpointing = False
self.latent_input = latent_input
if control_latent_channels is None:
control_latent_channels = self.in_channels
else:
control_latent_channels *= 2 * 2 #patch size
self.pos_embed_input = operations.Linear(control_latent_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
if not self.latent_input:
if self.mistoline:
self.input_cond_block = MistolineCondDownsamplBlock(dtype=dtype, device=device, operations=operations)
else:
self.input_hint_block = nn.Sequential(
operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
)
def forward_orig(
self,
img: Tensor,
img_ids: Tensor,
controlnet_cond: Tensor,
txt: Tensor,
txt_ids: Tensor,
timesteps: Tensor,
y: Tensor,
guidance: Tensor = None,
control_type: Tensor = None,
) -> Tensor:
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
controlnet_cond = self.pos_embed_input(controlnet_cond)
img = img + controlnet_cond
vec = self.time_in(timestep_embedding(timesteps, 256))
if self.params.guidance_embed:
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
if self.controlnet_mode_embedder is not None and len(control_type) > 0:
control_cond = self.controlnet_mode_embedder(torch.tensor(control_type, device=img.device), out_dtype=img.dtype).unsqueeze(0).repeat((txt.shape[0], 1, 1))
txt = torch.cat([control_cond, txt], dim=1)
txt_ids = torch.cat([txt_ids[:,:1], txt_ids], dim=1)
ids = torch.cat((txt_ids, img_ids), dim=1)
pe = self.pe_embedder(ids)
controlnet_double = ()
for i in range(len(self.double_blocks)):
img, txt = self.double_blocks[i](img=img, txt=txt, vec=vec, pe=pe)
controlnet_double = controlnet_double + (self.controlnet_blocks[i](img),)
img = torch.cat((txt, img), 1)
controlnet_single = ()
for i in range(len(self.single_blocks)):
img = self.single_blocks[i](img, vec=vec, pe=pe)
controlnet_single = controlnet_single + (self.controlnet_single_blocks[i](img[:, txt.shape[1] :, ...]),)
repeat = math.ceil(self.main_model_double / len(controlnet_double))
if self.latent_input:
out_input = ()
for x in controlnet_double:
out_input += (x,) * repeat
else:
out_input = (controlnet_double * repeat)
out = {"input": out_input[:self.main_model_double]}
if len(controlnet_single) > 0:
repeat = math.ceil(self.main_model_single / len(controlnet_single))
out_output = ()
if self.latent_input:
for x in controlnet_single:
out_output += (x,) * repeat
else:
out_output = (controlnet_single * repeat)
out["output"] = out_output[:self.main_model_single]
return out
def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
patch_size = 2
if self.latent_input:
hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
elif self.mistoline:
hint = hint * 2.0 - 1.0
hint = self.input_cond_block(hint)
else:
hint = hint * 2.0 - 1.0
hint = self.input_hint_block(hint)
hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
bs, c, h, w = x.shape
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
h_len = ((h + (patch_size // 2)) // patch_size)
w_len = ((w + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
return self.forward_orig(img, img_ids, hint, context, txt_ids, timesteps, y, guidance, control_type=kwargs.get("control_type", []))

comfy/ldm/flux/layers.py Normal file

@@ -0,0 +1,249 @@
import math
from dataclasses import dataclass
import torch
from torch import Tensor, nn
from .math import attention, rope
import comfy.ops
import comfy.ldm.common_dit
class EmbedND(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: list):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = axes_dim
def forward(self, ids: Tensor) -> Tensor:
n_axes = ids.shape[-1]
emb = torch.cat(
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
dim=-3,
)
return emb.unsqueeze(1)
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
t = time_factor * t
half = dim // 2
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
if torch.is_floating_point(t):
embedding = embedding.to(t)
return embedding
class MLPEmbedder(nn.Module):
def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.silu = nn.SiLU()
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
def forward(self, x: Tensor) -> Tensor:
return self.out_layer(self.silu(self.in_layer(x)))
class RMSNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
def forward(self, x: Tensor):
return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
class QKNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
q = self.query_norm(q)
k = self.key_norm(k)
return q.to(v), k.to(v)
class SelfAttention(nn.Module):
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
@dataclass
class ModulationOut:
shift: Tensor
scale: Tensor
gate: Tensor
class Modulation(nn.Module):
def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
super().__init__()
self.is_double = double
self.multiplier = 6 if double else 3
self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
def forward(self, vec: Tensor) -> tuple:
out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
return (
ModulationOut(*out[:3]),
ModulationOut(*out[3:]) if self.is_double else None,
)
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
# prepare image for attention
img_modulated = self.img_norm1(img)
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
img_qkv = self.img_attn.qkv(img_modulated)
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
# prepare txt for attention
txt_modulated = self.txt_norm1(txt)
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
txt_qkv = self.txt_attn.qkv(txt_modulated)
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
# run actual attention
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2), pe=pe)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
# calculate the img blocks
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
# calculate the txt blocks
txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
if txt.dtype == torch.float16:
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
return img, txt
class SingleStreamBlock(nn.Module):
"""
A DiT block with parallel linear layers as described in
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float = 4.0,
qk_scale: float = None,
dtype=None,
device=None,
operations=None
):
super().__init__()
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
# qkv and mlp_in
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
# proj and mlp_out
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.mlp_act = nn.GELU(approximate="tanh")
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
mod, _ = self.modulation(vec)
x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x += mod.gate * output
if x.dtype == torch.float16:
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
return x
class LastLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
x = self.linear(x)
return x

comfy/ldm/flux/math.py Normal file

@@ -0,0 +1,35 @@
import torch
from einops import rearrange
from torch import Tensor

from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management

def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
    q, k = apply_rope(q, k, pe)

    heads = q.shape[1]
    x = optimized_attention(q, k, v, heads, skip_reshape=True)
    return x

def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
        device = torch.device("cpu")
    else:
        device = pos.device

    scale = torch.linspace(0, (dim - 2) / dim, steps=dim//2, dtype=torch.float64, device=device)
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega)
    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
    return out.to(dtype=torch.float32, device=pos.device)

def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
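An illustrative shape check for the rotary helpers above; batch, head, and sequence sizes are made up, and a single 64-dim position axis is used purely for the example:

# --- illustrative example, not part of the diff ---
import torch
from comfy.ldm.flux.math import rope, apply_rope

b, heads, n, d = 1, 4, 10, 64
pos = torch.arange(n)[None, :].float()            # (b, n) token positions
pe = rope(pos, dim=d, theta=10000).unsqueeze(1)   # (b, 1, n, d/2, 2, 2), broadcasts over heads
q = torch.randn(b, heads, n, d)
k = torch.randn(b, heads, n, d)
q_rot, k_rot = apply_rope(q, k, pe)
print(q_rot.shape, k_rot.shape)                   # both torch.Size([1, 4, 10, 64])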

comfy/ldm/flux/model.py Normal file

@@ -0,0 +1,160 @@
#Original code can be found on: https://github.com/black-forest-labs/flux
from dataclasses import dataclass
import torch
from torch import Tensor, nn
from .layers import (
DoubleStreamBlock,
EmbedND,
LastLayer,
MLPEmbedder,
SingleStreamBlock,
timestep_embedding,
)
from einops import rearrange, repeat
import comfy.ldm.common_dit
@dataclass
class FluxParams:
in_channels: int
vec_in_dim: int
context_in_dim: int
hidden_size: int
mlp_ratio: float
num_heads: int
depth: int
depth_single_blocks: int
axes_dim: list
theta: int
qkv_bias: bool
guidance_embed: bool
class Flux(nn.Module):
"""
Transformer model for flow matching on sequences.
"""
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.dtype = dtype
params = FluxParams(**kwargs)
self.params = params
self.in_channels = params.in_channels * 2 * 2
self.out_channels = self.in_channels
if params.hidden_size % params.num_heads != 0:
raise ValueError(
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
)
pe_dim = params.hidden_size // params.num_heads
if sum(params.axes_dim) != pe_dim:
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = params.hidden_size
self.num_heads = params.num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
)
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
self.hidden_size,
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
]
)
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
for _ in range(params.depth_single_blocks)
]
)
if final_layer:
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
def forward_orig(
self,
img: Tensor,
img_ids: Tensor,
txt: Tensor,
txt_ids: Tensor,
timesteps: Tensor,
y: Tensor,
guidance: Tensor = None,
control=None,
) -> Tensor:
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
if self.params.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
txt = self.txt_in(txt)
ids = torch.cat((txt_ids, img_ids), dim=1)
pe = self.pe_embedder(ids)
for i, block in enumerate(self.double_blocks):
img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
if control is not None: # Controlnet
control_i = control.get("input")
if i < len(control_i):
add = control_i[i]
if add is not None:
img += add
img = torch.cat((txt, img), 1)
for i, block in enumerate(self.single_blocks):
img = block(img, vec=vec, pe=pe)
if control is not None: # Controlnet
control_o = control.get("output")
if i < len(control_o):
add = control_o[i]
if add is not None:
img[:, txt.shape[1] :, ...] += add
img = img[:, txt.shape[1] :, ...]
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
return img
def forward(self, x, timestep, context, y, guidance, control=None, **kwargs):
bs, c, h, w = x.shape
patch_size = 2
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
h_len = ((h + (patch_size // 2)) // patch_size)
w_len = ((w + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[:, :, 1] = torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
img_ids[:, :, 2] = torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control)
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
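The forward pass above packs the padded latent into 2x2 patch tokens before the transformer and unpacks them at the end. The einops pattern in isolation, with made-up sizes:

# --- illustrative example, not part of the diff ---
import torch
from einops import rearrange

x = torch.randn(1, 16, 64, 64)                    # padded Flux latent
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
print(img.shape)                                  # torch.Size([1, 1024, 64]): 32*32 tokens of 16*2*2 values
back = rearrange(img, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=32, w=32, ph=2, pw=2)
print(torch.equal(back, x))                       # True: the mapping is a pure reshape/permute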


@@ -0,0 +1,218 @@
import torch
import torch.nn as nn
from typing import Tuple, Union, Optional
from comfy.ldm.modules.attention import optimized_attention
def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
"""
Reshape frequency tensor for broadcasting it with another tensor.
This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
for the purpose of broadcasting the frequency tensor during element-wise operations.
Args:
freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
x (torch.Tensor): Target tensor for broadcasting compatibility.
head_first (bool): head dimension first (except batch dim) or not.
Returns:
torch.Tensor: Reshaped frequency tensor.
Raises:
AssertionError: If the frequency tensor doesn't match the expected shape.
AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
"""
ndim = x.ndim
assert 0 <= 1 < ndim
if isinstance(freqs_cis, tuple):
# freqs_cis: (cos, sin) in real space
if head_first:
assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
else:
# freqs_cis: values in complex space
if head_first:
assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis.shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def rotate_half(x):
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
def apply_rotary_emb(
xq: torch.Tensor,
xk: Optional[torch.Tensor],
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
head_first: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Apply rotary embeddings to input tensors using the given frequency tensor.
This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
returned as real tensors.
Args:
xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Precomputed frequency tensor for complex exponentials.
head_first (bool): head dimension first (except batch dim) or not.
Returns:
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
"""
xk_out = None
if isinstance(freqs_cis, tuple):
cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
xq_out = (xq * cos + rotate_half(xq) * sin)
if xk is not None:
xk_out = (xk * cos + rotate_half(xk) * sin)
else:
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) # [B, S, H, D//2]
freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device) # [S, D//2] --> [1, S, 1, D//2]
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
if xk is not None:
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) # [B, S, H, D//2]
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
return xq_out, xk_out
class CrossAttention(nn.Module):
"""
Use QK Normalization.
"""
def __init__(self,
qdim,
kdim,
num_heads,
qkv_bias=True,
qk_norm=False,
attn_drop=0.0,
proj_drop=0.0,
attn_precision=None,
device=None,
dtype=None,
operations=None,
):
factory_kwargs = {'device': device, 'dtype': dtype}
super().__init__()
self.attn_precision = attn_precision
self.qdim = qdim
self.kdim = kdim
self.num_heads = num_heads
assert self.qdim % num_heads == 0, "self.qdim must be divisible by num_heads"
self.head_dim = self.qdim // num_heads
assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
self.scale = self.head_dim ** -0.5
self.q_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
self.kv_proj = operations.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs)
# TODO: eps should be 1 / 65530 if using fp16
self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.attn_drop = nn.Dropout(attn_drop)
self.out_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, y, freqs_cis_img=None):
"""
Parameters
----------
x: torch.Tensor
(batch, seqlen1, hidden_dim) (where hidden_dim = num heads * head dim)
y: torch.Tensor
(batch, seqlen2, hidden_dim2)
freqs_cis_img: torch.Tensor
(batch, hidden_dim // 2), RoPE for image
"""
b, s1, c = x.shape # [b, s1, D]
_, s2, c = y.shape # [b, s2, 1024]
q = self.q_proj(x).view(b, s1, self.num_heads, self.head_dim) # [b, s1, h, d]
kv = self.kv_proj(y).view(b, s2, 2, self.num_heads, self.head_dim) # [b, s2, 2, h, d]
k, v = kv.unbind(dim=2) # [b, s, h, d]
q = self.q_norm(q)
k = self.k_norm(k)
# Apply RoPE if needed
if freqs_cis_img is not None:
qq, _ = apply_rotary_emb(q, None, freqs_cis_img)
assert qq.shape == q.shape, f'qq: {qq.shape}, q: {q.shape}'
q = qq
q = q.transpose(-2, -3).contiguous() # q -> B, L1, H, C - B, H, L1, C
k = k.transpose(-2, -3).contiguous() # k -> B, L2, H, C - B, H, C, L2
v = v.transpose(-2, -3).contiguous()
context = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
out = self.out_proj(context) # context.reshape - B, L1, -1
out = self.proj_drop(out)
out_tuple = (out,)
return out_tuple
class Attention(nn.Module):
"""
We rename some layer names to align with flash attention
"""
def __init__(self, dim, num_heads, qkv_bias=True, qk_norm=False, attn_drop=0., proj_drop=0., attn_precision=None, dtype=None, device=None, operations=None):
super().__init__()
self.attn_precision = attn_precision
self.dim = dim
self.num_heads = num_heads
assert self.dim % num_heads == 0, 'dim should be divisible by num_heads'
self.head_dim = self.dim // num_heads
# This assertion is aligned with flash attention
assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
self.scale = self.head_dim ** -0.5
# qkv --> Wqkv
self.Wqkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
# TODO: eps should be 1 / 65530 if using fp16
self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.attn_drop = nn.Dropout(attn_drop)
self.out_proj = operations.Linear(dim, dim, dtype=dtype, device=device)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, freqs_cis_img=None):
B, N, C = x.shape
qkv = self.Wqkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) # [3, b, h, s, d]
q, k, v = qkv.unbind(0) # [b, h, s, d]
q = self.q_norm(q) # [b, h, s, d]
k = self.k_norm(k) # [b, h, s, d]
# Apply RoPE if needed
if freqs_cis_img is not None:
qq, kk = apply_rotary_emb(q, k, freqs_cis_img, head_first=True)
assert qq.shape == q.shape and kk.shape == k.shape, \
f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}'
q, k = qq, kk
x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
x = self.out_proj(x)
x = self.proj_drop(x)
out_tuple = (x,)
return out_tuple


@@ -0,0 +1,321 @@
from typing import Any, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import checkpoint
from comfy.ldm.modules.diffusionmodules.mmdit import (
Mlp,
TimestepEmbedder,
PatchEmbed,
RMSNorm,
)
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from .poolers import AttentionPool
import comfy.latent_formats
from .models import HunYuanDiTBlock, calc_rope
from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
class HunYuanControlNet(nn.Module):
"""
HunYuanDiT: Diffusion model with a Transformer backbone.
Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
Parameters
----------
args: argparse.Namespace
The arguments parsed by argparse.
input_size: tuple
The size of the input image.
patch_size: int
The size of the patch.
in_channels: int
The number of input channels.
hidden_size: int
The hidden size of the transformer backbone.
depth: int
The number of transformer blocks.
num_heads: int
The number of attention heads.
mlp_ratio: float
The ratio of the hidden size of the MLP in the transformer block.
log_fn: callable
The logging function.
"""
def __init__(
self,
input_size: tuple = 128,
patch_size: int = 2,
in_channels: int = 4,
hidden_size: int = 1408,
depth: int = 40,
num_heads: int = 16,
mlp_ratio: float = 4.3637,
text_states_dim=1024,
text_states_dim_t5=2048,
text_len=77,
text_len_t5=256,
qk_norm=True, # See http://arxiv.org/abs/2302.05442 for details.
size_cond=False,
use_style_cond=False,
learn_sigma=True,
norm="layer",
log_fn: callable = print,
attn_precision=None,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.log_fn = log_fn
self.depth = depth
self.learn_sigma = learn_sigma
self.in_channels = in_channels
self.out_channels = in_channels * 2 if learn_sigma else in_channels
self.patch_size = patch_size
self.num_heads = num_heads
self.hidden_size = hidden_size
self.text_states_dim = text_states_dim
self.text_states_dim_t5 = text_states_dim_t5
self.text_len = text_len
self.text_len_t5 = text_len_t5
self.size_cond = size_cond
self.use_style_cond = use_style_cond
self.norm = norm
self.dtype = dtype
self.latent_format = comfy.latent_formats.SDXL
self.mlp_t5 = nn.Sequential(
nn.Linear(
self.text_states_dim_t5,
self.text_states_dim_t5 * 4,
bias=True,
dtype=dtype,
device=device,
),
nn.SiLU(),
nn.Linear(
self.text_states_dim_t5 * 4,
self.text_states_dim,
bias=True,
dtype=dtype,
device=device,
),
)
# learnable replace
self.text_embedding_padding = nn.Parameter(
torch.randn(
self.text_len + self.text_len_t5,
self.text_states_dim,
dtype=dtype,
device=device,
)
)
# Attention pooling
pooler_out_dim = 1024
self.pooler = AttentionPool(
self.text_len_t5,
self.text_states_dim_t5,
num_heads=8,
output_dim=pooler_out_dim,
dtype=dtype,
device=device,
operations=operations,
)
# Dimension of the extra input vectors
self.extra_in_dim = pooler_out_dim
if self.size_cond:
# Image size and crop size conditions
self.extra_in_dim += 6 * 256
if self.use_style_cond:
# Here we use a default learned embedder layer for future extension.
self.style_embedder = nn.Embedding(
1, hidden_size, dtype=dtype, device=device
)
self.extra_in_dim += hidden_size
# Text embedding for `add`
self.x_embedder = PatchEmbed(
input_size,
patch_size,
in_channels,
hidden_size,
dtype=dtype,
device=device,
operations=operations,
)
self.t_embedder = TimestepEmbedder(
hidden_size, dtype=dtype, device=device, operations=operations
)
self.extra_embedder = nn.Sequential(
operations.Linear(
self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device
),
nn.SiLU(),
operations.Linear(
hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device
),
)
# Image embedding
num_patches = self.x_embedder.num_patches
        # HunYuanDiT Blocks
self.blocks = nn.ModuleList(
[
HunYuanDiTBlock(
hidden_size=hidden_size,
c_emb_size=hidden_size,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
text_states_dim=self.text_states_dim,
qk_norm=qk_norm,
norm_type=self.norm,
skip=False,
attn_precision=attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
for _ in range(19)
]
)
# Input zero linear for the first block
self.before_proj = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
# Output zero linear for the every block
self.after_proj_list = nn.ModuleList(
[
operations.Linear(
self.hidden_size, self.hidden_size, dtype=dtype, device=device
)
for _ in range(len(self.blocks))
]
)
def forward(
self,
x,
hint,
timesteps,
context,#encoder_hidden_states=None,
text_embedding_mask=None,
encoder_hidden_states_t5=None,
text_embedding_mask_t5=None,
image_meta_size=None,
style=None,
return_dict=False,
**kwarg,
):
"""
Forward pass of the encoder.
Parameters
----------
x: torch.Tensor
(B, D, H, W)
t: torch.Tensor
(B)
encoder_hidden_states: torch.Tensor
CLIP text embedding, (B, L_clip, D)
text_embedding_mask: torch.Tensor
CLIP text embedding mask, (B, L_clip)
encoder_hidden_states_t5: torch.Tensor
T5 text embedding, (B, L_t5, D)
text_embedding_mask_t5: torch.Tensor
T5 text embedding mask, (B, L_t5)
image_meta_size: torch.Tensor
(B, 6)
style: torch.Tensor
(B)
cos_cis_img: torch.Tensor
sin_cis_img: torch.Tensor
return_dict: bool
Whether to return a dictionary.
"""
condition = hint
if condition.shape[0] == 1:
condition = torch.repeat_interleave(condition, x.shape[0], dim=0)
text_states = context # 2,77,1024
text_states_t5 = encoder_hidden_states_t5 # 2,256,2048
text_states_mask = text_embedding_mask.bool() # 2,77
text_states_t5_mask = text_embedding_mask_t5.bool() # 2,256
b_t5, l_t5, c_t5 = text_states_t5.shape
text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
text_states[:, -self.text_len :] = torch.where(
text_states_mask[:, -self.text_len :].unsqueeze(2),
text_states[:, -self.text_len :],
padding[: self.text_len],
)
text_states_t5[:, -self.text_len_t5 :] = torch.where(
text_states_t5_mask[:, -self.text_len_t5 :].unsqueeze(2),
text_states_t5[:, -self.text_len_t5 :],
padding[self.text_len :],
)
        text_states = torch.cat([text_states, text_states_t5], dim=1)  # 2,205,1024
# _, _, oh, ow = x.shape
# th, tw = oh // self.patch_size, ow // self.patch_size
# Get image RoPE embedding according to `reso`lution.
freqs_cis_img = calc_rope(
x, self.patch_size, self.hidden_size // self.num_heads
) # (cos_cis_img, sin_cis_img)
# ========================= Build time and image embedding =========================
t = self.t_embedder(timesteps, dtype=self.dtype)
x = self.x_embedder(x)
# ========================= Concatenate all extra vectors =========================
# Build text tokens with pooling
extra_vec = self.pooler(encoder_hidden_states_t5)
# Build image meta size tokens if applicable
# if image_meta_size is not None:
# image_meta_size = timestep_embedding(image_meta_size.view(-1), 256) # [B * 6, 256]
# if image_meta_size.dtype != self.dtype:
# image_meta_size = image_meta_size.half()
# image_meta_size = image_meta_size.view(-1, 6 * 256)
# extra_vec = torch.cat([extra_vec, image_meta_size], dim=1) # [B, D + 6 * 256]
# Build style tokens
if style is not None:
style_embedding = self.style_embedder(style)
extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
# Concatenate all extra vectors
c = t + self.extra_embedder(extra_vec) # [B, D]
# ========================= Deal with Condition =========================
condition = self.x_embedder(condition)
# ========================= Forward pass through HunYuanDiT blocks =========================
controls = []
x = x + self.before_proj(condition) # add condition
for layer, block in enumerate(self.blocks):
x = block(x, c, text_states, freqs_cis_img)
controls.append(self.after_proj_list[layer](x)) # zero linear for output
return {"output": controls}

410
comfy/ldm/hydit/models.py Normal file
View File

@@ -0,0 +1,410 @@
from typing import Any
import torch
import torch.nn as nn
import torch.nn.functional as F
import comfy.ops
from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from torch.utils import checkpoint
from .attn_layers import Attention, CrossAttention
from .poolers import AttentionPool
from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
def calc_rope(x, patch_size, head_size):
th = (x.shape[2] + (patch_size // 2)) // patch_size
tw = (x.shape[3] + (patch_size // 2)) // patch_size
base_size = 512 // 8 // patch_size
start, stop = get_fill_resize_and_crop((th, tw), base_size)
sub_args = [start, stop, (th, tw)]
# head_size = HUNYUAN_DIT_CONFIG['DiT-g/2']['hidden_size'] // HUNYUAN_DIT_CONFIG['DiT-g/2']['num_heads']
rope = get_2d_rotary_pos_embed(head_size, *sub_args)
rope = (rope[0].to(x), rope[1].to(x))
return rope
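# Illustrative shape check for calc_rope (an added sketch, not part of the diff): a
# 1024x1024 image gives a 128x128 latent; with patch_size=2 the token grid is 64x64,
# base_size = 512 // 8 // 2 = 32, and with the DiT-g/2 head size of 1408 // 16 = 88
# the returned cos/sin tensors are each [64 * 64, 88].
_example_latent = torch.zeros(1, 4, 128, 128)
_example_cos, _example_sin = calc_rope(_example_latent, 2, 88)
assert _example_cos.shape == (64 * 64, 88) and _example_sin.shape == (64 * 64, 88)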
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
class HunYuanDiTBlock(nn.Module):
"""
A HunYuanDiT block with `add` conditioning.
"""
def __init__(self,
hidden_size,
c_emb_size,
num_heads,
mlp_ratio=4.0,
text_states_dim=1024,
qk_norm=False,
norm_type="layer",
skip=False,
attn_precision=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
use_ele_affine = True
if norm_type == "layer":
norm_layer = operations.LayerNorm
elif norm_type == "rms":
norm_layer = RMSNorm
else:
raise ValueError(f"Unknown norm_type: {norm_type}")
# ========================= Self-Attention =========================
self.norm1 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
# ========================= FFN =========================
self.norm2 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
mlp_hidden_dim = int(hidden_size * mlp_ratio)
approx_gelu = lambda: nn.GELU(approximate="tanh")
self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0, dtype=dtype, device=device, operations=operations)
# ========================= Add =========================
# Simply use add like SDXL.
self.default_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(c_emb_size, hidden_size, bias=True, dtype=dtype, device=device)
)
# ========================= Cross-Attention =========================
self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=True,
qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
self.norm3 = norm_layer(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
# ========================= Skip Connection =========================
if skip:
self.skip_norm = norm_layer(2 * hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
self.skip_linear = operations.Linear(2 * hidden_size, hidden_size, dtype=dtype, device=device)
else:
self.skip_linear = None
self.gradient_checkpointing = False
def _forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
# Long Skip Connection
if self.skip_linear is not None:
cat = torch.cat([x, skip], dim=-1)
if cat.dtype != x.dtype:
cat = cat.to(x.dtype)
cat = self.skip_norm(cat)
x = self.skip_linear(cat)
# Self-Attention
shift_msa = self.default_modulation(c).unsqueeze(dim=1)
attn_inputs = (
self.norm1(x) + shift_msa, freq_cis_img,
)
x = x + self.attn1(*attn_inputs)[0]
# Cross-Attention
cross_inputs = (
self.norm3(x), text_states, freq_cis_img
)
x = x + self.attn2(*cross_inputs)[0]
# FFN Layer
mlp_inputs = self.norm2(x)
x = x + self.mlp(mlp_inputs)
return x
def forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
if self.gradient_checkpointing and self.training:
return checkpoint.checkpoint(self._forward, x, c, text_states, freq_cis_img, skip)
return self._forward(x, c, text_states, freq_cis_img, skip)
class FinalLayer(nn.Module):
"""
The final layer of HunYuanDiT.
"""
def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(c_emb_size, 2 * final_hidden_size, bias=True, dtype=dtype, device=device)
)
def forward(self, x, c):
shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
x = modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
return x
class HunYuanDiT(nn.Module):
"""
HunYuanDiT: Diffusion model with a Transformer backbone.
Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
Parameters
----------
args: argparse.Namespace
The arguments parsed by argparse.
input_size: tuple
The size of the input image.
patch_size: int
The size of the patch.
in_channels: int
The number of input channels.
hidden_size: int
The hidden size of the transformer backbone.
depth: int
The number of transformer blocks.
num_heads: int
The number of attention heads.
mlp_ratio: float
The ratio of the hidden size of the MLP in the transformer block.
log_fn: callable
The logging function.
"""
#@register_to_config
def __init__(self,
input_size: tuple = 32,
patch_size: int = 2,
in_channels: int = 4,
hidden_size: int = 1152,
depth: int = 28,
num_heads: int = 16,
mlp_ratio: float = 4.0,
text_states_dim = 1024,
text_states_dim_t5 = 2048,
text_len = 77,
text_len_t5 = 256,
qk_norm = True,# See http://arxiv.org/abs/2302.05442 for details.
size_cond = False,
use_style_cond = False,
learn_sigma = True,
norm = "layer",
log_fn: callable = print,
attn_precision=None,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.log_fn = log_fn
self.depth = depth
self.learn_sigma = learn_sigma
self.in_channels = in_channels
self.out_channels = in_channels * 2 if learn_sigma else in_channels
self.patch_size = patch_size
self.num_heads = num_heads
self.hidden_size = hidden_size
self.text_states_dim = text_states_dim
self.text_states_dim_t5 = text_states_dim_t5
self.text_len = text_len
self.text_len_t5 = text_len_t5
self.size_cond = size_cond
self.use_style_cond = use_style_cond
self.norm = norm
self.dtype = dtype
#import pdb
#pdb.set_trace()
self.mlp_t5 = nn.Sequential(
operations.Linear(self.text_states_dim_t5, self.text_states_dim_t5 * 4, bias=True, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(self.text_states_dim_t5 * 4, self.text_states_dim, bias=True, dtype=dtype, device=device),
)
# learnable replace
self.text_embedding_padding = nn.Parameter(
torch.empty(self.text_len + self.text_len_t5, self.text_states_dim, dtype=dtype, device=device))
# Attention pooling
pooler_out_dim = 1024
self.pooler = AttentionPool(self.text_len_t5, self.text_states_dim_t5, num_heads=8, output_dim=pooler_out_dim, dtype=dtype, device=device, operations=operations)
# Dimension of the extra input vectors
self.extra_in_dim = pooler_out_dim
if self.size_cond:
# Image size and crop size conditions
self.extra_in_dim += 6 * 256
if self.use_style_cond:
# Here we use a default learned embedder layer for future extension.
self.style_embedder = operations.Embedding(1, hidden_size, dtype=dtype, device=device)
self.extra_in_dim += hidden_size
# Text embedding for `add`
self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, dtype=dtype, device=device, operations=operations)
self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype, device=device, operations=operations)
self.extra_embedder = nn.Sequential(
operations.Linear(self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device),
)
# Image embedding
num_patches = self.x_embedder.num_patches
        # HunYuanDiT Blocks
self.blocks = nn.ModuleList([
HunYuanDiTBlock(hidden_size=hidden_size,
c_emb_size=hidden_size,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
text_states_dim=self.text_states_dim,
qk_norm=qk_norm,
norm_type=self.norm,
skip=layer > depth // 2,
attn_precision=attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
for layer in range(depth)
])
self.final_layer = FinalLayer(hidden_size, hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
self.unpatchify_channels = self.out_channels
def forward(self,
x,
t,
context,#encoder_hidden_states=None,
text_embedding_mask=None,
encoder_hidden_states_t5=None,
text_embedding_mask_t5=None,
image_meta_size=None,
style=None,
return_dict=False,
control=None,
transformer_options=None,
):
"""
Forward pass of the encoder.
Parameters
----------
x: torch.Tensor
(B, D, H, W)
t: torch.Tensor
(B)
encoder_hidden_states: torch.Tensor
CLIP text embedding, (B, L_clip, D)
text_embedding_mask: torch.Tensor
CLIP text embedding mask, (B, L_clip)
encoder_hidden_states_t5: torch.Tensor
T5 text embedding, (B, L_t5, D)
text_embedding_mask_t5: torch.Tensor
T5 text embedding mask, (B, L_t5)
image_meta_size: torch.Tensor
(B, 6)
style: torch.Tensor
(B)
cos_cis_img: torch.Tensor
sin_cis_img: torch.Tensor
return_dict: bool
Whether to return a dictionary.
"""
#import pdb
#pdb.set_trace()
encoder_hidden_states = context
text_states = encoder_hidden_states # 2,77,1024
text_states_t5 = encoder_hidden_states_t5 # 2,256,2048
text_states_mask = text_embedding_mask.bool() # 2,77
text_states_t5_mask = text_embedding_mask_t5.bool() # 2,256
b_t5, l_t5, c_t5 = text_states_t5.shape
text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
text_states[:,-self.text_len:] = torch.where(text_states_mask[:,-self.text_len:].unsqueeze(2), text_states[:,-self.text_len:], padding[:self.text_len])
text_states_t5[:,-self.text_len_t5:] = torch.where(text_states_t5_mask[:,-self.text_len_t5:].unsqueeze(2), text_states_t5[:,-self.text_len_t5:], padding[self.text_len:])
        text_states = torch.cat([text_states, text_states_t5], dim=1)  # 2,205,1024
# clip_t5_mask = torch.cat([text_states_mask, text_states_t5_mask], dim=-1)
_, _, oh, ow = x.shape
th, tw = (oh + (self.patch_size // 2)) // self.patch_size, (ow + (self.patch_size // 2)) // self.patch_size
# Get image RoPE embedding according to `reso`lution.
freqs_cis_img = calc_rope(x, self.patch_size, self.hidden_size // self.num_heads) #(cos_cis_img, sin_cis_img)
# ========================= Build time and image embedding =========================
t = self.t_embedder(t, dtype=x.dtype)
x = self.x_embedder(x)
# ========================= Concatenate all extra vectors =========================
# Build text tokens with pooling
extra_vec = self.pooler(encoder_hidden_states_t5)
# Build image meta size tokens if applicable
if self.size_cond:
image_meta_size = timestep_embedding(image_meta_size.view(-1), 256).to(x.dtype) # [B * 6, 256]
image_meta_size = image_meta_size.view(-1, 6 * 256)
extra_vec = torch.cat([extra_vec, image_meta_size], dim=1) # [B, D + 6 * 256]
# Build style tokens
if self.use_style_cond:
if style is None:
style = torch.zeros((extra_vec.shape[0],), device=x.device, dtype=torch.int)
style_embedding = self.style_embedder(style, out_dtype=x.dtype)
extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
# Concatenate all extra vectors
c = t + self.extra_embedder(extra_vec) # [B, D]
controls = None
if control:
controls = control.get("output", None)
# ========================= Forward pass through HunYuanDiT blocks =========================
skips = []
for layer, block in enumerate(self.blocks):
if layer > self.depth // 2:
if controls is not None:
skip = skips.pop() + controls.pop().to(dtype=x.dtype)
else:
skip = skips.pop()
x = block(x, c, text_states, freqs_cis_img, skip) # (N, L, D)
else:
x = block(x, c, text_states, freqs_cis_img) # (N, L, D)
if layer < (self.depth // 2 - 1):
skips.append(x)
if controls is not None and len(controls) != 0:
raise ValueError("The number of controls is not equal to the number of skip connections.")
# ========================= Final layer =========================
x = self.final_layer(x, c) # (N, L, patch_size ** 2 * out_channels)
x = self.unpatchify(x, th, tw) # (N, out_channels, H, W)
if return_dict:
return {'x': x}
if self.learn_sigma:
return x[:,:self.out_channels // 2,:oh,:ow]
return x[:,:,:oh,:ow]
def unpatchify(self, x, h, w):
"""
x: (N, T, patch_size**2 * C)
imgs: (N, H, W, C)
"""
c = self.unpatchify_channels
p = self.x_embedder.patch_size[0]
# h = w = int(x.shape[1] ** 0.5)
assert h * w == x.shape[1]
x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
x = torch.einsum('nhwpqc->nchpwq', x)
imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
return imgs
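As a sanity check on the unpatchify step above, a hedged sketch with made-up sizes: with patch size 2, 8 output channels (learn_sigma doubles the 4 latent channels) and a 64x64 token grid, the final-layer output folds back into an 8-channel 128x128 latent before the first out_channels // 2 channels are returned:

import torch

N, h, w, p, c = 1, 64, 64, 2, 8
x = torch.randn(N, h * w, p * p * c)
x = x.reshape(N, h, w, p, p, c)
x = torch.einsum('nhwpqc->nchpwq', x)
imgs = x.reshape(N, c, h * p, w * p)
assert imgs.shape == (N, 8, 128, 128)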

View File

@@ -0,0 +1,37 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.attention import optimized_attention
import comfy.ops
class AttentionPool(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None, dtype=None, device=None, operations=None):
super().__init__()
self.positional_embedding = nn.Parameter(torch.empty(spacial_dim + 1, embed_dim, dtype=dtype, device=device))
self.k_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.q_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.v_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.c_proj = operations.Linear(embed_dim, output_dim or embed_dim, dtype=dtype, device=device)
self.num_heads = num_heads
self.embed_dim = embed_dim
def forward(self, x):
x = x[:,:self.positional_embedding.shape[0] - 1]
x = x.permute(1, 0, 2) # NLC -> LNC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (L+1)NC
x = x + comfy.ops.cast_to_input(self.positional_embedding[:, None, :], x) # (L+1)NC
q = self.q_proj(x[:1])
k = self.k_proj(x)
v = self.v_proj(x)
batch_size = q.shape[1]
head_dim = self.embed_dim // self.num_heads
q = q.view(1, batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
k = k.view(k.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
v = v.view(v.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
attn_output = optimized_attention(q, k, v, self.num_heads, skip_reshape=True).transpose(0, 1)
attn_output = self.c_proj(attn_output)
return attn_output.squeeze(0)
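A rough usage sketch for AttentionPool as HunYuanDiT instantiates it (spacial_dim=256, embed_dim=2048, output_dim=1024). The ops class and tensor values are placeholders chosen only to exercise the shapes; the mean token acts as the query over the whole sequence:

import torch
import comfy.ops

pool = AttentionPool(256, 2048, num_heads=8, output_dim=1024,
                     dtype=torch.float32, operations=comfy.ops.disable_weight_init)
t5_states = torch.randn(2, 256, 2048)
pooled = pool(t5_states)
assert pooled.shape == (2, 1024)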

View File

@@ -0,0 +1,224 @@
import torch
import numpy as np
from typing import Union
def _to_tuple(x):
if isinstance(x, int):
return x, x
else:
return x
def get_fill_resize_and_crop(src, tgt):
th, tw = _to_tuple(tgt)
h, w = _to_tuple(src)
tr = th / tw # base resolution
r = h / w # target resolution
# resize
if r > tr:
resize_height = th
resize_width = int(round(th / h * w))
else:
resize_width = tw
resize_height = int(round(tw / w * h)) # resize the target resolution down based on the base resolution
crop_top = int(round((th - resize_height) / 2.0))
crop_left = int(round((tw - resize_width) / 2.0))
return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
def get_meshgrid(start, *args):
if len(args) == 0:
# start is grid_size
num = _to_tuple(start)
start = (0, 0)
stop = num
elif len(args) == 1:
# start is start, args[0] is stop, step is 1
start = _to_tuple(start)
stop = _to_tuple(args[0])
num = (stop[0] - start[0], stop[1] - start[1])
elif len(args) == 2:
# start is start, args[0] is stop, args[1] is num
start = _to_tuple(start)
stop = _to_tuple(args[0])
num = _to_tuple(args[1])
else:
raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
grid_h = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)
grid_w = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0) # [2, W, H]
return grid
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid = get_meshgrid(start, *args) # [2, H, w]
# grid_h = np.arange(grid_size, dtype=np.float32)
# grid_w = np.arange(grid_size, dtype=np.float32)
# grid = np.meshgrid(grid_w, grid_h) # here w goes first
# grid = np.stack(grid, axis=0) # [2, W, H]
grid = grid.reshape([2, 1, *grid.shape[1:]])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token and extra_tokens > 0:
pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (W,H)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.
omega = 1. / 10000**omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
#################################################################################
# Rotary Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443
def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True):
"""
This is a 2d version of precompute_freqs_cis, which is a RoPE for image tokens with 2d structure.
Parameters
----------
embed_dim: int
embedding dimension size
start: int or tuple of int
If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1;
If len(args) == 2, start is start, args[0] is stop, args[1] is num.
use_real: bool
If True, return real part and imaginary part separately. Otherwise, return complex numbers.
Returns
-------
pos_embed: torch.Tensor
[HW, D/2]
"""
grid = get_meshgrid(start, *args) # [2, H, w]
grid = grid.reshape([2, 1, *grid.shape[1:]]) # Returns a sampling matrix with the same resolution as the target resolution
pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)
return pos_embed
def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
assert embed_dim % 4 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real) # (H*W, D/4)
emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real) # (H*W, D/4)
if use_real:
cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D/2)
sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D/2)
return cos, sin
else:
emb = torch.cat([emb_h, emb_w], dim=1) # (H*W, D/2)
return emb
def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):
"""
Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
and the end index 'end'. The 'theta' parameter scales the frequencies.
The returned tensor contains complex values in complex64 data type.
Args:
dim (int): Dimension of the frequency tensor.
pos (np.ndarray, int): Position indices for the frequency tensor. [S] or scalar
theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
use_real (bool, optional): If True, return real part and imaginary part separately.
Otherwise, return complex numbers.
Returns:
torch.Tensor: Precomputed frequency tensor with complex exponentials. [S, D/2]
"""
if isinstance(pos, int):
pos = np.arange(pos)
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) # [D/2]
t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]
freqs = torch.outer(t, freqs).float() # type: ignore # [S, D/2]
if use_real:
freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
return freqs_cos, freqs_sin
else:
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
return freqs_cis
def calc_sizes(rope_img, patch_size, th, tw):
if rope_img == 'extend':
# Expansion mode
sub_args = [(th, tw)]
elif rope_img.startswith('base'):
# Based on the specified dimensions, other dimensions are obtained through interpolation.
base_size = int(rope_img[4:]) // 8 // patch_size
start, stop = get_fill_resize_and_crop((th, tw), base_size)
sub_args = [start, stop, (th, tw)]
else:
raise ValueError(f"Unknown rope_img: {rope_img}")
return sub_args
def init_image_posemb(rope_img,
resolutions,
patch_size,
hidden_size,
num_heads,
log_fn,
rope_real=True,
):
freqs_cis_img = {}
for reso in resolutions:
th, tw = reso.height // 8 // patch_size, reso.width // 8 // patch_size
sub_args = calc_sizes(rope_img, patch_size, th, tw)
freqs_cis_img[str(reso)] = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real)
log_fn(f" Using image RoPE ({rope_img}) ({'real' if rope_real else 'complex'}): {sub_args} | ({reso}) "
f"{freqs_cis_img[str(reso)][0].shape if rope_real else freqs_cis_img[str(reso)].shape}")
return freqs_cis_img
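For reference, a small hedged check of what the 1D RoPE helper above returns in both modes (the numbers are chosen only for illustration):

import torch

cos, sin = get_1d_rotary_pos_embed(64, 128, use_real=True)
assert cos.shape == (128, 64) and sin.shape == (128, 64)

freqs_cis = get_1d_rotary_pos_embed(64, 128, use_real=False)
assert freqs_cis.shape == (128, 32) and freqs_cis.dtype == torch.complex64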

View File

@@ -358,7 +358,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
            disabled_xformers = True
    if disabled_xformers:
-       return attention_pytorch(q, k, v, heads, mask)
+       return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)
    if skip_reshape:
        q, k, v = map(

View File

@@ -7,6 +7,9 @@ import torch
import torch.nn as nn
from .. import attention
from einops import rearrange, repeat
from .util import timestep_embedding
import comfy.ops
import comfy.ldm.common_dit

def default(x, y):
    if x is not None:
@@ -68,12 +71,14 @@ class PatchEmbed(nn.Module):
            bias: bool = True,
            strict_img_size: bool = True,
            dynamic_img_pad: bool = True,
            padding_mode='circular',
            dtype=None,
            device=None,
            operations=None,
    ):
        super().__init__()
        self.patch_size = (patch_size, patch_size)
        self.padding_mode = padding_mode
        if img_size is not None:
            self.img_size = (img_size, img_size)
            self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
@@ -107,9 +112,7 @@ class PatchEmbed(nn.Module):
# f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})." # f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
# ) # )
if self.dynamic_img_pad: if self.dynamic_img_pad:
pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0] x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode=self.padding_mode)
pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
x = self.proj(x) x = self.proj(x)
if self.flatten: if self.flatten:
x = x.flatten(2).transpose(1, 2) # NCHW -> NLC x = x.flatten(2).transpose(1, 2) # NCHW -> NLC
@@ -230,34 +233,8 @@ class TimestepEmbedder(nn.Module):
        )
        self.frequency_embedding_size = frequency_embedding_size

-   @staticmethod
-   def timestep_embedding(t, dim, max_period=10000):
-       """
-       Create sinusoidal timestep embeddings.
-       :param t: a 1-D Tensor of N indices, one per batch element.
-                 These may be fractional.
-       :param dim: the dimension of the output.
-       :param max_period: controls the minimum frequency of the embeddings.
-       :return: an (N, D) Tensor of positional embeddings.
-       """
-       half = dim // 2
-       freqs = torch.exp(
-           -math.log(max_period)
-           * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
-           / half
-       )
-       args = t[:, None].float() * freqs[None]
-       embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-       if dim % 2:
-           embedding = torch.cat(
-               [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
-           )
-       if torch.is_floating_point(t):
-           embedding = embedding.to(dtype=t.dtype)
-       return embedding

    def forward(self, t, dtype, **kwargs):
-       t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+       t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype)
        t_emb = self.mlp(t_freq)
        return t_emb
@@ -378,29 +355,9 @@ class RMSNorm(torch.nn.Module):
        else:
            self.register_parameter("weight", None)

-   def _norm(self, x):
-       """
-       Apply the RMSNorm normalization to the input tensor.
-       Args:
-           x (torch.Tensor): The input tensor.
-       Returns:
-           torch.Tensor: The normalized tensor.
-       """
-       return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
-       """
-       Forward pass through the RMSNorm layer.
-       Args:
-           x (torch.Tensor): The input tensor.
-       Returns:
-           torch.Tensor: The output tensor after applying RMSNorm.
-       """
-       x = self._norm(x)
-       if self.learnable_scale:
-           return x * self.weight.to(device=x.device, dtype=x.dtype)
-       else:
-           return x
+       return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)


class SwiGLUFeedForward(nn.Module):
@@ -949,7 +906,7 @@ class MMDiT(nn.Module):
        context = self.context_processor(context)

        hw = x.shape[-2:]
-       x = self.x_embedder(x) + self.cropped_pos_embed(hw, device=x.device).to(dtype=x.dtype, device=x.device)
+       x = self.x_embedder(x) + comfy.ops.cast_to_input(self.cropped_pos_embed(hw, device=x.device), x)
        c = self.t_embedder(t, dtype=x.dtype)  # (N, D)
        if y is not None and self.y_embedder is not None:
            y = self.y_embedder(y)  # (N, D)

View File

@@ -809,7 +809,7 @@ class UNetModel(nn.Module):
        self.out = nn.Sequential(
            operations.GroupNorm(32, ch, dtype=self.dtype, device=device),
            nn.SiLU(),
-           zero_module(operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device)),
+           operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
@@ -842,6 +842,11 @@ class UNetModel(nn.Module):
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
        emb = self.time_embed(t_emb)

        if "emb_patch" in transformer_patches:
            patch = transformer_patches["emb_patch"]
            for p in patch:
                emb = p(emb, self.model_channels, transformer_options)

        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

View File

@@ -1,5 +1,27 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from __future__ import annotations
import comfy.utils
import comfy.model_management
import comfy.model_base
import logging
import torch

LORA_CLIP_MAP = {
    "mlp.fc1": "mlp_fc1",
@@ -179,9 +201,13 @@ def load_lora(lora, to_load):
def model_lora_keys_clip(model, key_map={}):
    sdk = model.state_dict().keys()

    for k in sdk:
        if k.endswith(".weight"):
            key_map["text_encoders.{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names

    text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
    clip_l_present = False
    clip_g_present = False
    for b in range(32): #TODO: clean up
        for c in LORA_CLIP_MAP:
            k = "clip_h.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
@@ -205,6 +231,7 @@ def model_lora_keys_clip(model, key_map={}):
k = "clip_g.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c) k = "clip_g.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
if k in sdk: if k in sdk:
clip_g_present = True
if clip_l_present: if clip_l_present:
lora_key = "lora_te2_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #SDXL base lora_key = "lora_te2_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #SDXL base
key_map[lora_key] = k key_map[lora_key] = k
@@ -218,11 +245,25 @@ def model_lora_keys_clip(model, key_map={}):
lora_key = "lora_prior_te_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #cascade lora: TODO put lora key prefix in the model config lora_key = "lora_prior_te_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #cascade lora: TODO put lora key prefix in the model config
key_map[lora_key] = k key_map[lora_key] = k
for k in sdk: #OneTrainer SD3 lora for k in sdk:
if k.startswith("t5xxl.transformer.") and k.endswith(".weight"): if k.endswith(".weight"):
l_key = k[len("t5xxl.transformer."):-len(".weight")] if k.startswith("t5xxl.transformer."):#OneTrainer SD3 and Flux lora
lora_key = "lora_te3_{}".format(l_key.replace(".", "_")) l_key = k[len("t5xxl.transformer."):-len(".weight")]
key_map[lora_key] = k t5_index = 1
if clip_g_present:
t5_index += 1
if clip_l_present:
t5_index += 1
if t5_index == 2:
key_map["lora_te{}_{}".format(t5_index, l_key.replace(".", "_"))] = k #OneTrainer Flux
t5_index += 1
key_map["lora_te{}_{}".format(t5_index, l_key.replace(".", "_"))] = k
elif k.startswith("hydit_clip.transformer.bert."): #HunyuanDiT Lora
l_key = k[len("hydit_clip.transformer.bert."):-len(".weight")]
lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
key_map[lora_key] = k
k = "clip_g.transformer.text_projection.weight" k = "clip_g.transformer.text_projection.weight"
if k in sdk: if k in sdk:
@@ -245,6 +286,7 @@ def model_lora_keys_unet(model, key_map={}):
key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_") key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
key_map["lora_unet_{}".format(key_lora)] = k key_map["lora_unet_{}".format(key_lora)] = k
key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
key_map["{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config) diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config)
for k in diffusers_keys: for k in diffusers_keys:
@@ -252,6 +294,7 @@ def model_lora_keys_unet(model, key_map={}):
unet_key = "diffusion_model.{}".format(diffusers_keys[k]) unet_key = "diffusion_model.{}".format(diffusers_keys[k])
key_lora = k[:-len(".weight")].replace(".", "_") key_lora = k[:-len(".weight")].replace(".", "_")
key_map["lora_unet_{}".format(key_lora)] = unet_key key_map["lora_unet_{}".format(key_lora)] = unet_key
key_map["lycoris_{}".format(key_lora)] = unet_key #simpletuner lycoris format
diffusers_lora_prefix = ["", "unet."] diffusers_lora_prefix = ["", "unet."]
for p in diffusers_lora_prefix: for p in diffusers_lora_prefix:
@@ -274,4 +317,275 @@ def model_lora_keys_unet(model, key_map={}):
key_lora = "lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_")) #OneTrainer lora key_lora = "lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_")) #OneTrainer lora
key_map[key_lora] = to key_map[key_lora] = to
if isinstance(model, comfy.model_base.AuraFlow): #Diffusers lora AuraFlow
diffusers_keys = comfy.utils.auraflow_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
for k in diffusers_keys:
if k.endswith(".weight"):
to = diffusers_keys[k]
key_lora = "transformer.{}".format(k[:-len(".weight")]) #simpletrainer and probably regular diffusers lora format
key_map[key_lora] = to
if isinstance(model, comfy.model_base.HunyuanDiT):
for k in sdk:
if k.startswith("diffusion_model.") and k.endswith(".weight"):
key_lora = k[len("diffusion_model."):-len(".weight")]
key_map["base_model.model.{}".format(key_lora)] = k #official hunyuan lora format
if isinstance(model, comfy.model_base.Flux): #Diffusers lora Flux
diffusers_keys = comfy.utils.flux_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
for k in diffusers_keys:
if k.endswith(".weight"):
to = diffusers_keys[k]
key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format
key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer
    return key_map
def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
lora_diff *= alpha
weight_calc = weight + function(lora_diff).type(weight.dtype)
weight_norm = (
weight_calc.transpose(0, 1)
.reshape(weight_calc.shape[1], -1)
.norm(dim=1, keepdim=True)
.reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
.transpose(0, 1)
)
weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
if strength != 1.0:
weight_calc -= weight
weight += strength * (weight_calc)
else:
weight[:] = weight_calc
return weight
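# Shape-only sketch of the per-column norm built above (an added illustration, not
# part of the diff): for a 2D linear weight the reshape/transpose sequence yields one
# L2 norm per input column, which dora_scale is divided by when rescaling the merged
# weight.
_w = torch.randn(8, 4)            # (out_features, in_features)
_norm = (_w.transpose(0, 1)
         .reshape(_w.shape[1], -1)
         .norm(dim=1, keepdim=True)
         .reshape(_w.shape[1], *[1] * (_w.dim() - 1))
         .transpose(0, 1))
assert _norm.shape == (1, 4)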
def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Tensor:
"""
Pad a tensor to a new shape with zeros.
Args:
tensor (torch.Tensor): The original tensor to be padded.
new_shape (List[int]): The desired shape of the padded tensor.
Returns:
torch.Tensor: A new tensor padded with zeros to the specified shape.
Note:
If the new shape is smaller than the original tensor in any dimension,
the original tensor will be truncated in that dimension.
"""
if any([new_shape[i] < tensor.shape[i] for i in range(len(new_shape))]):
raise ValueError("The new shape must be larger than the original tensor in all dimensions")
if len(new_shape) != len(tensor.shape):
raise ValueError("The new shape must have the same number of dimensions as the original tensor")
# Create a new tensor filled with zeros
padded_tensor = torch.zeros(new_shape, dtype=tensor.dtype, device=tensor.device)
# Create slicing tuples for both tensors
orig_slices = tuple(slice(0, dim) for dim in tensor.shape)
new_slices = tuple(slice(0, dim) for dim in tensor.shape)
# Copy the original tensor into the new tensor
padded_tensor[new_slices] = tensor[orig_slices]
return padded_tensor
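# Hypothetical usage of pad_tensor_to_shape (illustration only, not part of the diff):
# growing a 4x3 weight to 6x5 keeps the original block in the top-left corner and
# zero-fills the rest.
_t = torch.ones(4, 3)
_padded = pad_tensor_to_shape(_t, [6, 5])
assert _padded.shape == (6, 5)
assert torch.equal(_padded[:4, :3], _t) and _padded[4:, :].abs().sum() == 0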
def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32):
for p in patches:
strength = p[0]
v = p[1]
strength_model = p[2]
offset = p[3]
function = p[4]
if function is None:
function = lambda a: a
old_weight = None
if offset is not None:
old_weight = weight
weight = weight.narrow(offset[0], offset[1], offset[2])
if strength_model != 1.0:
weight *= strength_model
if isinstance(v, list):
v = (calculate_weight(v[1:], v[0][1](comfy.model_management.cast_to_device(v[0][0], weight.device, intermediate_dtype, copy=True), inplace=True), key, intermediate_dtype=intermediate_dtype), )
if len(v) == 1:
patch_type = "diff"
elif len(v) == 2:
patch_type = v[0]
v = v[1]
if patch_type == "diff":
diff: torch.Tensor = v[0]
# An extra flag to pad the weight if the diff's shape is larger than the weight
do_pad_weight = len(v) > 1 and v[1]['pad_weight']
if do_pad_weight and diff.shape != weight.shape:
logging.info("Pad weight {} from {} to shape: {}".format(key, weight.shape, diff.shape))
weight = pad_tensor_to_shape(weight, diff.shape)
if strength != 0.0:
if diff.shape != weight.shape:
logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, diff.shape, weight.shape))
else:
weight += function(strength * comfy.model_management.cast_to_device(diff, weight.device, weight.dtype))
elif patch_type == "lora": #lora/locon
mat1 = comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype)
mat2 = comfy.model_management.cast_to_device(v[1], weight.device, intermediate_dtype)
dora_scale = v[4]
if v[2] is not None:
alpha = v[2] / mat2.shape[0]
else:
alpha = 1.0
if v[3] is not None:
#locon mid weights, hopefully the math is fine because I didn't properly test it
mat3 = comfy.model_management.cast_to_device(v[3], weight.device, intermediate_dtype)
final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
try:
lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "lokr":
w1 = v[0]
w2 = v[1]
w1_a = v[3]
w1_b = v[4]
w2_a = v[5]
w2_b = v[6]
t2 = v[7]
dora_scale = v[8]
dim = None
if w1 is None:
dim = w1_b.shape[0]
w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
else:
w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
if w2 is None:
dim = w2_b.shape[0]
if t2 is None:
w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
else:
w2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
else:
w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
if len(w2.shape) == 4:
w1 = w1.unsqueeze(2).unsqueeze(2)
if v[2] is not None and dim is not None:
alpha = v[2] / dim
else:
alpha = 1.0
try:
lora_diff = torch.kron(w1, w2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "loha":
w1a = v[0]
w1b = v[1]
if v[2] is not None:
alpha = v[2] / w1b.shape[0]
else:
alpha = 1.0
w2a = v[3]
w2b = v[4]
dora_scale = v[7]
if v[5] is not None: #cp decomposition
t1 = v[5]
t2 = v[6]
m1 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
m2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
else:
m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
try:
lora_diff = (m1 * m2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "glora":
dora_scale = v[5]
old_glora = False
if v[3].shape[1] == v[2].shape[0] == v[0].shape[0] == v[1].shape[1]:
rank = v[0].shape[0]
old_glora = True
if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]:
if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]:
pass
else:
old_glora = False
rank = v[1].shape[0]
a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
if v[4] is not None:
alpha = v[4] / rank
else:
alpha = 1.0
try:
if old_glora:
lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora
else:
if weight.dim() > 2:
lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
else:
lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
lora_diff += torch.mm(b1, b2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
else:
logging.warning("patch type not recognized {} {}".format(patch_type, key))
if old_weight is not None:
weight = old_weight
return weight
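A hedged example of the patch tuples calculate_weight consumes, (strength, value, strength_model, offset, function), using the simplest "diff" patch type; the tensors and key name below are made up purely to show the flow:

import torch

weight = torch.zeros(4, 4)
diff = torch.ones(4, 4)
patches = [(0.5, (diff,), 1.0, None, None)]    # a lone tensor in the value tuple is treated as a "diff" patch
merged = calculate_weight(patches, weight, "some.layer.weight")
assert torch.allclose(merged, torch.full((4, 4), 0.5))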

View File

@@ -1,3 +1,21 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
import logging
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
@@ -7,8 +25,11 @@ from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugme
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
import comfy.ldm.aura.mmdit
import comfy.ldm.hydit.models
import comfy.ldm.audio.dit
import comfy.ldm.audio.embedders
import comfy.ldm.flux.model
import comfy.model_management
import comfy.conds
import comfy.ops
@@ -25,6 +46,7 @@ class ModelType(Enum):
    EDM = 5
    FLOW = 6
    V_PREDICTION_CONTINUOUS = 7
    FLUX = 8

from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@@ -52,6 +74,9 @@ def model_sampling(model_config, model_type):
    elif model_type == ModelType.V_PREDICTION_CONTINUOUS:
        c = V_PREDICTION
        s = ModelSamplingContinuousV
    elif model_type == ModelType.FLUX:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingFlux

    class ModelSampling(s, c):
        pass
@@ -67,16 +92,19 @@ class BaseModel(torch.nn.Module):
        self.latent_format = model_config.latent_format
        self.model_config = model_config
        self.manual_cast_dtype = model_config.manual_cast_dtype
        self.device = device

        if not unet_config.get("disable_unet_model_creation", False):
-           if self.manual_cast_dtype is not None:
-               operations = comfy.ops.manual_cast
+           if model_config.custom_operations is None:
+               fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
+               operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
            else:
-               operations = comfy.ops.disable_weight_init
+               operations = model_config.custom_operations
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
            if comfy.model_management.force_channels_last():
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
            logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)
@@ -87,6 +115,7 @@ class BaseModel(torch.nn.Module):
        self.concat_keys = ()
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))
        self.memory_usage_factor = model_config.memory_usage_factor

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        sigma = t
@@ -216,6 +245,10 @@ class BaseModel(torch.nn.Module):
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

        unet_state_dict = self.diffusion_model.state_dict()

        if self.model_config.scaled_fp8 is not None:
            unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)

        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        if self.model_type == ModelType.V_PREDICTION:
@@ -245,11 +278,11 @@ class BaseModel(torch.nn.Module):
                 dtype = self.manual_cast_dtype
             #TODO: this needs to be tweaked
             area = input_shape[0] * math.prod(input_shape[2:])
-            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
+            return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
         else:
             #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
             area = input_shape[0] * math.prod(input_shape[2:])
-            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
+            return (area * 0.15 * self.memory_usage_factor) * (1024 * 1024)
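For a rough sense of scale of the new estimate, with hypothetical numbers (batch 1, a 128x128 latent, 2-byte weights, and an assumed memory_usage_factor of 2.8) the budget works out to just under 1 GB:

import math

# Hypothetical inputs; memory_usage_factor comes from the model config in ComfyUI.
input_shape = (1, 4, 128, 128)          # batch, channels, latent height, latent width
dtype_size = 2                           # fp16/bf16 bytes per element
memory_usage_factor = 2.8                # per-model fudge factor (assumed value)

area = input_shape[0] * math.prod(input_shape[2:])
estimate = (area * dtype_size * 0.01 * memory_usage_factor) * (1024 * 1024)
print(f"{estimate / (1024**3):.2f} GB")  # ~0.90 GB for these numbers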
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None): def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
@@ -347,6 +380,7 @@ class SDXL(BaseModel):
flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1) flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
return torch.cat((clip_pooled.to(flat.device), flat), dim=1) return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
class SVD_img2vid(BaseModel): class SVD_img2vid(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None): def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
super().__init__(model_config, model_type, device=device) super().__init__(model_config, model_type, device=device)
@@ -587,17 +621,6 @@ class SD3(BaseModel):
         out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
         return out

-    def memory_required(self, input_shape):
-        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
-            dtype = self.get_dtype()
-            if self.manual_cast_dtype is not None:
-                dtype = self.manual_cast_dtype
-            #TODO: this probably needs to be tweaked
-            area = input_shape[0] * input_shape[2] * input_shape[3]
-            return (area * comfy.model_management.dtype_size(dtype) * 0.012) * (1024 * 1024)
-        else:
-            area = input_shape[0] * input_shape[2] * input_shape[3]
-            return (area * 0.3) * (1024 * 1024)
class AuraFlow(BaseModel): class AuraFlow(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None): def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -648,3 +671,50 @@ class StableAudio1(BaseModel):
for l in s: for l in s:
sd["{}{}".format(k, l)] = s[l] sd["{}{}".format(k, l)] = s[l]
return sd return sd
class HunyuanDiT(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hydit.models.HunYuanDiT)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None:
out['text_embedding_mask'] = comfy.conds.CONDRegular(attention_mask)
conditioning_mt5xl = kwargs.get("conditioning_mt5xl", None)
if conditioning_mt5xl is not None:
out['encoder_hidden_states_t5'] = comfy.conds.CONDRegular(conditioning_mt5xl)
attention_mask_mt5xl = kwargs.get("attention_mask_mt5xl", None)
if attention_mask_mt5xl is not None:
out['text_embedding_mask_t5'] = comfy.conds.CONDRegular(attention_mask_mt5xl)
width = kwargs.get("width", 768)
height = kwargs.get("height", 768)
crop_w = kwargs.get("crop_w", 0)
crop_h = kwargs.get("crop_h", 0)
target_width = kwargs.get("target_width", width)
target_height = kwargs.get("target_height", height)
out['image_meta_size'] = comfy.conds.CONDRegular(torch.FloatTensor([[height, width, target_height, target_width, 0, 0]]))
return out
class Flux(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
def encode_adm(self, **kwargs):
return kwargs["pooled_output"]
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 3.5)]))
return out


@@ -109,8 +109,42 @@ def detect_unet_config(state_dict, key_prefix):
unet_config = {} unet_config = {}
unet_config["max_seq"] = state_dict['{}positional_encoding'.format(key_prefix)].shape[1] unet_config["max_seq"] = state_dict['{}positional_encoding'.format(key_prefix)].shape[1]
unet_config["cond_seq_dim"] = state_dict['{}cond_seq_linear.weight'.format(key_prefix)].shape[1] unet_config["cond_seq_dim"] = state_dict['{}cond_seq_linear.weight'.format(key_prefix)].shape[1]
double_layers = count_blocks(state_dict_keys, '{}double_layers.'.format(key_prefix) + '{}.')
single_layers = count_blocks(state_dict_keys, '{}single_layers.'.format(key_prefix) + '{}.')
unet_config["n_double_layers"] = double_layers
unet_config["n_layers"] = double_layers + single_layers
return unet_config return unet_config
if '{}mlp_t5.0.weight'.format(key_prefix) in state_dict_keys: #Hunyuan DiT
unet_config = {}
unet_config["image_model"] = "hydit"
unet_config["depth"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
unet_config["hidden_size"] = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[0]
if unet_config["hidden_size"] == 1408 and unet_config["depth"] == 40: #DiT-g/2
unet_config["mlp_ratio"] = 4.3637
if state_dict['{}extra_embedder.0.weight'.format(key_prefix)].shape[1] == 3968:
unet_config["size_cond"] = True
unet_config["use_style_cond"] = True
unet_config["image_model"] = "hydit1"
return unet_config
if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys: #Flux
dit_config = {}
dit_config["image_model"] = "flux"
dit_config["in_channels"] = 16
dit_config["vec_in_dim"] = 768
dit_config["context_in_dim"] = 4096
dit_config["hidden_size"] = 3072
dit_config["mlp_ratio"] = 4.0
dit_config["num_heads"] = 24
dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
dit_config["axes_dim"] = [16, 56, 56]
dit_config["theta"] = 10000
dit_config["qkv_bias"] = True
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
return dit_config
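The depth fields above come from counting numbered key prefixes in the checkpoint. A self-contained sketch of that idea, using a simplified stand-in for count_blocks and a toy key list:

import re

def count_numbered_blocks(keys, prefix):
    # Count how many distinct "{prefix}{i}." indices appear in the keys.
    pattern = re.compile(re.escape(prefix) + r"(\d+)\.")
    indices = {int(m.group(1)) for k in keys for m in [pattern.match(k)] if m}
    return max(indices) + 1 if indices else 0

toy_keys = ["double_blocks.0.img_attn.qkv.weight",
            "double_blocks.1.img_attn.qkv.weight",
            "single_blocks.0.linear1.weight"]
print(count_numbered_blocks(toy_keys, "double_blocks."))  # 2
print(count_numbered_blocks(toy_keys, "single_blocks."))  # 1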
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
return None return None
@@ -252,18 +286,33 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
         return None

     model_config = model_config_from_unet_config(unet_config, state_dict)
     if model_config is None and use_base_if_no_match:
-        return comfy.supported_models_base.BASE(unet_config)
-    else:
-        return model_config
+        model_config = comfy.supported_models_base.BASE(unet_config)
+
+    scaled_fp8_weight = state_dict.get("{}scaled_fp8".format(unet_key_prefix), None)
+    if scaled_fp8_weight is not None:
+        model_config.scaled_fp8 = scaled_fp8_weight.dtype
+        if model_config.scaled_fp8 == torch.float32:
+            model_config.scaled_fp8 = torch.float8_e4m3fn
+
+    return model_config

 def unet_prefix_from_state_dict(state_dict):
-    if "model.model.postprocess_conv.weight" in state_dict: #audio models
-        unet_key_prefix = "model.model."
-    elif "model.double_layers.0.attn.w1q.weight" in state_dict: #aura flow
-        unet_key_prefix = "model."
+    candidates = ["model.diffusion_model.", #ldm/sgm models
+                  "model.model.", #audio models
+                  ]
+    counts = {k: 0 for k in candidates}
+    for k in state_dict:
+        for c in candidates:
+            if k.startswith(c):
+                counts[c] += 1
+                break
+
+    top = max(counts, key=counts.get)
+    if counts[top] > 5:
+        return top
     else:
-        unet_key_prefix = "model.diffusion_model."
-    return unet_key_prefix
+        return "model." #aura flow and others
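A quick illustrative run of the new counting-based prefix detection on a toy key list (guess_unet_prefix is a simplified stand-in, not the ComfyUI function):

def guess_unet_prefix(state_dict_keys, candidates=("model.diffusion_model.", "model.model.")):
    # Simplified version of the counting logic: pick the candidate that owns
    # the most keys, falling back to "model." when nothing clearly dominates.
    counts = {c: sum(k.startswith(c) for k in state_dict_keys) for c in candidates}
    top = max(counts, key=counts.get)
    return top if counts[top] > 5 else "model."

# Toy key list: eleven diffusion_model keys, so that prefix clearly dominates.
keys = ["model.diffusion_model.input_blocks.{}.0.weight".format(i) for i in range(10)]
keys.append("model.diffusion_model.out.0.weight")
print(guess_unet_prefix(keys))  # model.diffusion_model.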
def convert_config(unet_config): def convert_config(unet_config):
new_config = unet_config.copy() new_config = unet_config.copy()
@@ -429,9 +478,15 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
                   'transformer_depth': [0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': False,
                   'context_dim': 768, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 1, 1, 1, 1],
                   'use_temporal_attention': False, 'use_temporal_resblock': False}

+    SD15_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
+                              'dtype': dtype, 'in_channels': 9, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
+                              'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
+                              'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
+                              'use_temporal_attention': False, 'use_temporal_resblock': False}
+
-    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p]
+    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]
for unet_config in supported_models: for unet_config in supported_models:
matches = True matches = True
@@ -450,37 +505,55 @@ def model_config_from_diffusers_unet(state_dict):
return None return None
 def convert_diffusers_mmdit(state_dict, output_prefix=""):
-    num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
-    if num_blocks > 0:
+    out_sd = {}
+
+    if 'transformer_blocks.0.attn.norm_added_k.weight' in state_dict: #Flux
+        depth = count_blocks(state_dict, 'transformer_blocks.{}.')
+        depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.')
+        hidden_size = state_dict["x_embedder.bias"].shape[0]
+        sd_map = comfy.utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix)
+    elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3
+        num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
         depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
-        out_sd = {}
         sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
-        for k in sd_map:
-            weight = state_dict.get(k, None)
-            if weight is not None:
-                t = sd_map[k]
+    elif 'joint_transformer_blocks.0.attn.add_k_proj.weight' in state_dict: #AuraFlow
+        num_joint = count_blocks(state_dict, 'joint_transformer_blocks.{}.')
+        num_single = count_blocks(state_dict, 'single_transformer_blocks.{}.')
+        sd_map = comfy.utils.auraflow_to_diffusers({"n_double_layers": num_joint, "n_layers": num_joint + num_single}, output_prefix=output_prefix)
+    else:
+        return None

-                if not isinstance(t, str):
-                    if len(t) > 2:
-                        fun = t[2]
-                    else:
-                        fun = lambda a: a
-                    offset = t[1]
-                    if offset is not None:
-                        old_weight = out_sd.get(t[0], None)
-                        if old_weight is None:
-                            old_weight = torch.empty_like(weight)
-                            old_weight = old_weight.repeat([3] + [1] * (len(old_weight.shape) - 1))
-
-                        w = old_weight.narrow(offset[0], offset[1], offset[2])
-                    else:
-                        old_weight = weight
-                        w = weight
-                    w[:] = fun(weight)
-                    t = t[0]
-                    out_sd[t] = old_weight
-                else:
-                    out_sd[t] = weight
-                state_dict.pop(k)
+    for k in sd_map:
+        weight = state_dict.get(k, None)
+        if weight is not None:
+            t = sd_map[k]
+
+            if not isinstance(t, str):
+                if len(t) > 2:
+                    fun = t[2]
+                else:
+                    fun = lambda a: a
+                offset = t[1]
+                if offset is not None:
+                    old_weight = out_sd.get(t[0], None)
+                    if old_weight is None:
+                        old_weight = torch.empty_like(weight)
+                    if old_weight.shape[offset[0]] < offset[1] + offset[2]:
+                        exp = list(weight.shape)
+                        exp[offset[0]] = offset[1] + offset[2]
+                        new = torch.empty(exp, device=weight.device, dtype=weight.dtype)
+                        new[:old_weight.shape[0]] = old_weight
+                        old_weight = new
+
+                    w = old_weight.narrow(offset[0], offset[1], offset[2])
+                else:
+                    old_weight = weight
+                    w = weight
+                w[:] = fun(weight)
+                t = t[0]
+                out_sd[t] = old_weight
+            else:
+                out_sd[t] = weight
+            state_dict.pop(k)

     return out_sd
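The offset branch above mostly serves to fuse separate q/k/v tensors from diffusers checkpoints into a single qkv weight, by narrowing the destination tensor and copying each piece into its slice. A standalone sketch of that narrow-and-copy idea with illustrative shapes (not the real key mapping):

import torch

hidden = 8
q = torch.randn(hidden, hidden)
k = torch.randn(hidden, hidden)
v = torch.randn(hidden, hidden)

# Destination fused tensor: dim 0 holds q, k, v stacked back to back.
qkv = torch.empty(hidden * 3, hidden)
for i, part in enumerate((q, k, v)):
    # offset = (dim, start, length), the same shape as the tuples in the sd_map entries.
    dim, start, length = 0, i * hidden, hidden
    qkv.narrow(dim, start, length).copy_(part)

assert torch.equal(qkv[:hidden], q) and torch.equal(qkv[2 * hidden:], v)
print(qkv.shape)  # torch.Size([24, 8])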


@@ -1,3 +1,21 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import psutil import psutil
import logging import logging
from enum import Enum from enum import Enum
@@ -26,9 +44,15 @@ cpu_state = CPUState.GPU
total_vram = 0 total_vram = 0
lowvram_available = True
xpu_available = False xpu_available = False
torch_version = ""
try:
torch_version = torch.version.__version__
xpu_available = (int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) <= 4)) and torch.xpu.is_available()
except:
pass
lowvram_available = True
if args.deterministic: if args.deterministic:
logging.info("Using deterministic algorithms for pytorch") logging.info("Using deterministic algorithms for pytorch")
torch.use_deterministic_algorithms(True, warn_only=True) torch.use_deterministic_algorithms(True, warn_only=True)
@@ -48,10 +72,10 @@ if args.directml is not None:
 try:
     import intel_extension_for_pytorch as ipex
-    if torch.xpu.is_available():
-        xpu_available = True
+    _ = torch.xpu.device_count()
+    xpu_available = torch.xpu.is_available()
 except:
-    pass
+    xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available())
try: try:
if torch.backends.mps.is_available(): if torch.backends.mps.is_available():
@@ -121,7 +145,7 @@ total_ram = psutil.virtual_memory().total / (1024 * 1024)
logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram)) logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))
try: try:
logging.info("pytorch version: {}".format(torch.version.__version__)) logging.info("pytorch version: {}".format(torch_version))
except: except:
pass pass
@@ -171,7 +195,6 @@ VAE_DTYPES = [torch.float32]
try: try:
if is_nvidia(): if is_nvidia():
torch_version = torch.version.__version__
if int(torch_version[0]) >= 2: if int(torch_version[0]) >= 2:
if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
ENABLE_PYTORCH_ATTENTION = True ENABLE_PYTORCH_ATTENTION = True
@@ -273,9 +296,12 @@ class LoadedModel:
     def model_memory(self):
         return self.model.model_size()

+    def model_offloaded_memory(self):
+        return self.model.model_size() - self.model.loaded_size()
+
     def model_memory_required(self, device):
-        if device == self.model.current_device:
-            return 0
+        if device == self.model.current_loaded_device():
+            return self.model_offloaded_memory()
         else:
             return self.model_memory()
@@ -287,38 +313,78 @@ class LoadedModel:
         load_weights = not self.weights_loaded

-        try:
-            if lowvram_model_memory > 0 and load_weights:
-                self.real_model = self.model.patch_model_lowvram(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights)
-            else:
-                self.real_model = self.model.patch_model(device_to=patch_model_to, patch_weights=load_weights)
-        except Exception as e:
-            self.model.unpatch_model(self.model.offload_device)
-            self.model_unload()
-            raise e
+        if self.model.loaded_size() > 0:
+            use_more_vram = lowvram_model_memory
+            if use_more_vram == 0:
+                use_more_vram = 1e32
+            self.model_use_more_vram(use_more_vram)
+        else:
+            try:
+                self.real_model = self.model.patch_model(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, load_weights=load_weights, force_patch_weights=force_patch_weights)
+            except Exception as e:
+                self.model.unpatch_model(self.model.offload_device)
+                self.model_unload()
+                raise e

-        if is_intel_xpu() and not args.disable_ipex_optimize:
-            self.real_model = ipex.optimize(self.real_model.eval(), graph_mode=True, concat_linear=True)
+        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and self.real_model is not None:
+            with torch.no_grad():
+                self.real_model = ipex.optimize(self.real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)

         self.weights_loaded = True
         return self.real_model
     def should_reload_model(self, force_patch_weights=False):
-        if force_patch_weights and self.model.lowvram_patch_counter > 0:
+        if force_patch_weights and self.model.lowvram_patch_counter() > 0:
             return True
         return False

-    def model_unload(self, unpatch_weights=True):
+    def model_unload(self, memory_to_free=None, unpatch_weights=True):
+        if memory_to_free is not None:
+            if memory_to_free < self.model.loaded_size():
+                freed = self.model.partially_unload(self.model.offload_device, memory_to_free)
+                if freed >= memory_to_free:
+                    return False
         self.model.unpatch_model(self.model.offload_device, unpatch_weights=unpatch_weights)
         self.model.model_patches_to(self.model.offload_device)
         self.weights_loaded = self.weights_loaded and not unpatch_weights
         self.real_model = None
+        return True
+
+    def model_use_more_vram(self, extra_memory):
+        return self.model.partially_load(self.device, extra_memory)

     def __eq__(self, other):
         return self.model is other.model
def use_more_memory(extra_memory, loaded_models, device):
for m in loaded_models:
if m.device == device:
extra_memory -= m.model_use_more_vram(extra_memory)
if extra_memory <= 0:
break
def offloaded_memory(loaded_models, device):
offloaded_mem = 0
for m in loaded_models:
if m.device == device:
offloaded_mem += m.model_offloaded_memory()
return offloaded_mem
WINDOWS = any(platform.win32_ver())
EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
if WINDOWS:
EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
if args.reserve_vram is not None:
EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
logging.debug("Reserving {}MB vram for other applications.".format(EXTRA_RESERVED_VRAM / (1024 * 1024)))
def extra_reserved_memory():
return EXTRA_RESERVED_VRAM
 def minimum_inference_memory():
-    return (1024 * 1024 * 1024)
+    return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()
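With the defaults above, the inference headroom floor is roughly 1.2 GB on Linux and 1.4 GB on Windows (0.8 GiB plus the 400 MiB or 600 MiB reservation), unless --reserve-vram overrides it. A small arithmetic check:

GIB = 1024 * 1024 * 1024
MIB = 1024 * 1024

def minimum_inference_memory(extra_reserved):
    return GIB * 0.8 + extra_reserved

# Defaults from the code above; --reserve-vram would replace extra_reserved.
print(minimum_inference_memory(400 * MIB) / MIB)  # ~1219 MiB (Linux default)
print(minimum_inference_memory(600 * MIB) / MIB)  # ~1419 MiB (Windows default)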
def unload_model_clones(model, unload_weights_only=True, force_unload=True): def unload_model_clones(model, unload_weights_only=True, force_unload=True):
to_unload = [] to_unload = []
@@ -342,6 +408,8 @@ def unload_model_clones(model, unload_weights_only=True, force_unload=True):
if not force_unload: if not force_unload:
if unload_weights_only and unload_weight == False: if unload_weights_only and unload_weight == False:
return None return None
else:
unload_weight = True
for i in to_unload: for i in to_unload:
logging.debug("unload clone {} {}".format(i, unload_weight)) logging.debug("unload clone {} {}".format(i, unload_weight))
@@ -352,24 +420,29 @@ def unload_model_clones(model, unload_weights_only=True, force_unload=True):
def free_memory(memory_required, device, keep_loaded=[]): def free_memory(memory_required, device, keep_loaded=[]):
unloaded_model = [] unloaded_model = []
can_unload = [] can_unload = []
unloaded_models = []
for i in range(len(current_loaded_models) -1, -1, -1): for i in range(len(current_loaded_models) -1, -1, -1):
shift_model = current_loaded_models[i] shift_model = current_loaded_models[i]
if shift_model.device == device: if shift_model.device == device:
if shift_model not in keep_loaded: if shift_model not in keep_loaded:
can_unload.append((sys.getrefcount(shift_model.model), shift_model.model_memory(), i)) can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
shift_model.currently_used = False shift_model.currently_used = False
for x in sorted(can_unload): for x in sorted(can_unload):
i = x[-1] i = x[-1]
memory_to_free = None
if not DISABLE_SMART_MEMORY: if not DISABLE_SMART_MEMORY:
if get_free_memory(device) > memory_required: free_mem = get_free_memory(device)
if free_mem > memory_required:
break break
current_loaded_models[i].model_unload() memory_to_free = memory_required - free_mem
unloaded_model.append(i) logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
if current_loaded_models[i].model_unload(memory_to_free):
unloaded_model.append(i)
for i in sorted(unloaded_model, reverse=True): for i in sorted(unloaded_model, reverse=True):
current_loaded_models.pop(i) unloaded_models.append(current_loaded_models.pop(i))
if len(unloaded_model) > 0: if len(unloaded_model) > 0:
soft_empty_cache() soft_empty_cache()
@@ -378,12 +451,17 @@ def free_memory(memory_required, device, keep_loaded=[]):
mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True) mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
if mem_free_torch > mem_free_total * 0.25: if mem_free_torch > mem_free_total * 0.25:
soft_empty_cache() soft_empty_cache()
return unloaded_models
def load_models_gpu(models, memory_required=0, force_patch_weights=False): def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
global vram_state global vram_state
inference_memory = minimum_inference_memory() inference_memory = minimum_inference_memory()
extra_mem = max(inference_memory, memory_required) extra_mem = max(inference_memory, memory_required + extra_reserved_memory())
if minimum_memory_required is None:
minimum_memory_required = extra_mem
else:
minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())
models = set(models) models = set(models)
@@ -416,25 +494,36 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):
devs = set(map(lambda a: a.device, models_already_loaded)) devs = set(map(lambda a: a.device, models_already_loaded))
for d in devs: for d in devs:
if d != torch.device("cpu"): if d != torch.device("cpu"):
free_memory(extra_mem, d, models_already_loaded) free_memory(extra_mem + offloaded_memory(models_already_loaded, d), d, models_already_loaded)
return free_mem = get_free_memory(d)
if free_mem < minimum_memory_required:
logging.info("Unloading models for lowram load.") #TODO: partial model unloading when this case happens, also handle the opposite case where models can be unlowvramed.
models_to_load = free_memory(minimum_memory_required, d)
logging.info("{} models unloaded.".format(len(models_to_load)))
else:
use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
if len(models_to_load) == 0:
return
logging.info(f"Loading {len(models_to_load)} new model{'s' if len(models_to_load) > 1 else ''}") logging.info(f"Loading {len(models_to_load)} new model{'s' if len(models_to_load) > 1 else ''}")
total_memory_required = {} total_memory_required = {}
for loaded_model in models_to_load: for loaded_model in models_to_load:
if unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) == True:#unload clones where the weights are different unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) #unload clones where the weights are different
total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device) total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
for device in total_memory_required: for loaded_model in models_already_loaded:
if device != torch.device("cpu"): total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
free_memory(total_memory_required[device] * 1.3 + extra_mem, device, models_already_loaded)
for loaded_model in models_to_load: for loaded_model in models_to_load:
weights_unloaded = unload_model_clones(loaded_model.model, unload_weights_only=False, force_unload=False) #unload the rest of the clones where the weights can stay loaded weights_unloaded = unload_model_clones(loaded_model.model, unload_weights_only=False, force_unload=False) #unload the rest of the clones where the weights can stay loaded
if weights_unloaded is not None: if weights_unloaded is not None:
loaded_model.weights_loaded = not weights_unloaded loaded_model.weights_loaded = not weights_unloaded
for device in total_memory_required:
if device != torch.device("cpu"):
free_memory(total_memory_required[device] * 1.1 + extra_mem, device, models_already_loaded)
for loaded_model in models_to_load: for loaded_model in models_to_load:
model = loaded_model.model model = loaded_model.model
torch_dev = model.load_device torch_dev = model.load_device
@@ -443,11 +532,11 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):
else: else:
vram_set_state = vram_state vram_set_state = vram_state
lowvram_model_memory = 0 lowvram_model_memory = 0
if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM): if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM) and not force_full_load:
model_size = loaded_model.model_memory_required(torch_dev) model_size = loaded_model.model_memory_required(torch_dev)
current_free_mem = get_free_memory(torch_dev) current_free_mem = get_free_memory(torch_dev)
lowvram_model_memory = int(max(64 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3 )) lowvram_model_memory = max(64 * (1024 * 1024), (current_free_mem - minimum_memory_required), min(current_free_mem * 0.4, current_free_mem - minimum_inference_memory()))
if model_size <= (current_free_mem - inference_memory): #only switch to lowvram if really necessary if model_size <= lowvram_model_memory: #only switch to lowvram if really necessary
lowvram_model_memory = 0 lowvram_model_memory = 0
if vram_set_state == VRAMState.NO_VRAM: if vram_set_state == VRAMState.NO_VRAM:
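The new lowvram budget keeps enough headroom for inference and never drops below 64 MB, and a model only goes into lowvram mode when it does not fit inside that budget. A worked example with hypothetical numbers (8 GB free, about 1.2 GB of required headroom):

MIB = 1024 * 1024

def lowvram_budget(free_mem, minimum_memory_required, minimum_inference_memory):
    # Mirrors the new formula: leave headroom for inference, never go below 64MB.
    return max(64 * MIB,
               free_mem - minimum_memory_required,
               min(free_mem * 0.4, free_mem - minimum_inference_memory))

# Hypothetical card with 8GB free and ~1.2GB of required headroom.
budget = lowvram_budget(8192 * MIB, 1228 * MIB, 1219 * MIB)
print(budget / MIB)  # ~6964 MiB: models smaller than this load fully, bigger ones go lowvram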
@@ -455,6 +544,14 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False):
cur_loaded_model = loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights) cur_loaded_model = loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights)
current_loaded_models.insert(0, loaded_model) current_loaded_models.insert(0, loaded_model)
devs = set(map(lambda a: a.device, models_already_loaded))
for d in devs:
if d != torch.device("cpu"):
free_mem = get_free_memory(d)
if free_mem > minimum_memory_required:
use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
return return
@@ -474,7 +571,9 @@ def loaded_models(only_currently_used=False):
def cleanup_models(keep_clone_weights_loaded=False): def cleanup_models(keep_clone_weights_loaded=False):
to_delete = [] to_delete = []
for i in range(len(current_loaded_models)): for i in range(len(current_loaded_models)):
if sys.getrefcount(current_loaded_models[i].model) <= 2: #TODO: very fragile function needs improvement
num_refs = sys.getrefcount(current_loaded_models[i].model)
if num_refs <= 2:
if not keep_clone_weights_loaded: if not keep_clone_weights_loaded:
to_delete = [i] + to_delete to_delete = [i] + to_delete
#TODO: find a less fragile way to do this. #TODO: find a less fragile way to do this.
@@ -523,7 +622,12 @@ def unet_inital_load_device(parameters, dtype):
else: else:
return cpu_dev return cpu_dev
def maximum_vram_for_weights(device=None):
return (get_total_memory(device) * 0.88 - minimum_inference_memory())
def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]): def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
if model_params < 0:
model_params = 1000000000000000000000
if args.bf16_unet: if args.bf16_unet:
return torch.bfloat16 return torch.bfloat16
if args.fp16_unet: if args.fp16_unet:
@@ -532,12 +636,40 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
return torch.float8_e4m3fn return torch.float8_e4m3fn
if args.fp8_e5m2_unet: if args.fp8_e5m2_unet:
return torch.float8_e5m2 return torch.float8_e5m2
if should_use_fp16(device=device, model_params=model_params, manual_cast=True):
if torch.float16 in supported_dtypes: fp8_dtype = None
return torch.float16 try:
if should_use_bf16(device, model_params=model_params, manual_cast=True): for dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
if torch.bfloat16 in supported_dtypes: if dtype in supported_dtypes:
return torch.bfloat16 fp8_dtype = dtype
break
except:
pass
if fp8_dtype is not None:
if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
return fp8_dtype
free_model_memory = maximum_vram_for_weights(device)
if model_params * 2 > free_model_memory:
return fp8_dtype
for dt in supported_dtypes:
if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params):
if torch.float16 in supported_dtypes:
return torch.float16
if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params):
if torch.bfloat16 in supported_dtypes:
return torch.bfloat16
for dt in supported_dtypes:
if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params, manual_cast=True):
if torch.float16 in supported_dtypes:
return torch.float16
if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params, manual_cast=True):
if torch.bfloat16 in supported_dtypes:
return torch.bfloat16
return torch.float32 return torch.float32
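The rewritten unet_dtype walks supported_dtypes in order and can hand back an fp8 dtype when the GPU has fp8 compute or the weights would not otherwise fit. A simplified standalone sketch of that decision order; the string dtype names and boolean flags are stand-ins for the real torch dtypes and device probes:

def pick_unet_dtype(supported, fp8_compute_ok, weights_fit_in_vram,
                    fp16_ok_native, bf16_ok_native):
    # Simplified decision order; the real function also honors the --bf16-unet/--fp8
    # flags and distinguishes native support from manual-cast support.
    fp8 = next((d for d in supported if d in ("fp8_e4m3fn", "fp8_e5m2")), None)
    if fp8 and (fp8_compute_ok or not weights_fit_in_vram):
        return fp8
    for d in supported:
        if d == "fp16" and fp16_ok_native:
            return "fp16"
        if d == "bf16" and bf16_ok_native:
            return "bf16"
    return "fp32"

print(pick_unet_dtype(["bf16", "fp16"], False, True, True, True))  # bf16: list order wins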
# None means no manual cast # None means no manual cast
@@ -553,13 +685,14 @@ def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.flo
     if bf16_supported and weight_dtype == torch.bfloat16:
         return None

-    if fp16_supported and torch.float16 in supported_dtypes:
-        return torch.float16
-
-    elif bf16_supported and torch.bfloat16 in supported_dtypes:
-        return torch.bfloat16
-    else:
-        return torch.float32
+    fp16_supported = should_use_fp16(inference_device, prioritize_performance=True)
+    for dt in supported_dtypes:
+        if dt == torch.float16 and fp16_supported:
+            return torch.float16
+        if dt == torch.bfloat16 and bf16_supported:
+            return torch.bfloat16
+
+    return torch.float32
def text_encoder_offload_device(): def text_encoder_offload_device():
if args.gpu_only: if args.gpu_only:
@@ -578,6 +711,20 @@ def text_encoder_device():
else: else:
return torch.device("cpu") return torch.device("cpu")
def text_encoder_initial_device(load_device, offload_device, model_size=0):
if load_device == offload_device or model_size <= 1024 * 1024 * 1024:
return offload_device
if is_device_mps(load_device):
return offload_device
mem_l = get_free_memory(load_device)
mem_o = get_free_memory(offload_device)
if mem_l > (mem_o * 0.5) and model_size * 1.2 < mem_l:
return load_device
else:
return offload_device
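text_encoder_initial_device keeps small text encoders on the offload device and only starts them on the load device when it has clearly more free memory plus room to spare. A worked example with hypothetical sizes:

GIB = 1024 * 1024 * 1024

def pick_initial_device(model_size, free_gpu, free_cpu, same_device=False, is_mps=False):
    # Mirrors the heuristic: tiny models (<=1GB) and MPS always start offloaded.
    if same_device or model_size <= 1 * GIB or is_mps:
        return "offload"
    if free_gpu > free_cpu * 0.5 and model_size * 1.2 < free_gpu:
        return "load"
    return "offload"

# Hypothetical 5GB text encoder with 12GB free VRAM and 16GB free RAM -> start on the GPU.
print(pick_initial_device(5 * GIB, 12 * GIB, 16 * GIB))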
def text_encoder_dtype(device=None): def text_encoder_dtype(device=None):
if args.fp8_e4m3fn_text_enc: if args.fp8_e4m3fn_text_enc:
return torch.float8_e4m3fn return torch.float8_e4m3fn
@@ -649,18 +796,29 @@ def supports_cast(device, dtype): #TODO
return True return True
if dtype == torch.float16: if dtype == torch.float16:
return True return True
if is_device_mps(device):
return False
if directml_enabled: #TODO: test this if directml_enabled: #TODO: test this
return False return False
if dtype == torch.bfloat16: if dtype == torch.bfloat16:
return True return True
if is_device_mps(device):
return False
if dtype == torch.float8_e4m3fn: if dtype == torch.float8_e4m3fn:
return True return True
if dtype == torch.float8_e5m2: if dtype == torch.float8_e5m2:
return True return True
return False return False
def pick_weight_dtype(dtype, fallback_dtype, device=None):
if dtype is None:
dtype = fallback_dtype
elif dtype_size(dtype) > dtype_size(fallback_dtype):
dtype = fallback_dtype
if not supports_cast(device, dtype):
dtype = fallback_dtype
return dtype
def device_supports_non_blocking(device): def device_supports_non_blocking(device):
if is_device_mps(device): if is_device_mps(device):
return False #pytorch bug? mps doesn't support non blocking return False #pytorch bug? mps doesn't support non blocking
@@ -685,27 +843,21 @@ def force_channels_last():
#TODO #TODO
return False return False
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
if device is None or weight.device == device:
if not copy:
if dtype is None or weight.dtype == dtype:
return weight
return weight.to(dtype=dtype, copy=copy)
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight, non_blocking=non_blocking)
return r
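cast_to avoids an intermediate tensor by allocating the destination once and copying into it, optionally non-blocking. A small self-contained usage sketch (same logic as the helper above, repeated so it runs on its own; on CPU the device move is a no-op and only the dtype path is exercised):

import torch

def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
    # Return the tensor untouched when nothing changes, otherwise allocate
    # the destination and copy in one step.
    if device is None or weight.device == device:
        if not copy and (dtype is None or weight.dtype == dtype):
            return weight
        return weight.to(dtype=dtype, copy=copy)
    r = torch.empty_like(weight, dtype=dtype, device=device)
    r.copy_(weight, non_blocking=non_blocking)
    return r

w = torch.randn(4, dtype=torch.float32)
print(cast_to(w) is w)                         # True: no-op fast path
print(cast_to(w, dtype=torch.float16).dtype)   # torch.float16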
def cast_to_device(tensor, device, dtype, copy=False): def cast_to_device(tensor, device, dtype, copy=False):
device_supports_cast = False non_blocking = device_supports_non_blocking(device)
if tensor.dtype == torch.float32 or tensor.dtype == torch.float16: return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy)
device_supports_cast = True
elif tensor.dtype == torch.bfloat16:
if hasattr(device, 'type') and device.type.startswith("cuda"):
device_supports_cast = True
elif is_intel_xpu():
device_supports_cast = True
non_blocking = device_should_use_non_blocking(device)
if device_supports_cast:
if copy:
if tensor.device == device:
return tensor.to(dtype, copy=copy, non_blocking=non_blocking)
return tensor.to(device, copy=copy, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
else:
return tensor.to(device, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
else:
return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)
def xformers_enabled(): def xformers_enabled():
global directml_enabled global directml_enabled
@@ -743,7 +895,8 @@ def pytorch_attention_flash_attention():
def force_upcast_attention_dtype(): def force_upcast_attention_dtype():
upcast = args.force_upcast_attention upcast = args.force_upcast_attention
try: try:
if platform.mac_ver()[0] in ['14.5']: #black image bug on OSX Sonoma 14.5 macos_version = tuple(int(n) for n in platform.mac_ver()[0].split("."))
if (14, 5) <= macos_version <= (15, 0, 1): # black image bug on recent versions of macOS
upcast = True upcast = True
except: except:
pass pass
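The upcast workaround now uses a tuple comparison instead of matching the exact string '14.5', so every macOS release from 14.5 through 15.0.1 is covered. Tuple comparison in Python is lexicographic, which a quick check makes visible:

def needs_upcast(mac_ver_string):
    # (14, 6, 1) sorts between (14, 5) and (15, 0, 1), so point releases are caught.
    version = tuple(int(n) for n in mac_ver_string.split("."))
    return (14, 5) <= version <= (15, 0, 1)

for v in ["14.4.1", "14.5", "14.6.1", "15.0.1", "15.1"]:
    print(v, needs_upcast(v))  # False, True, True, True, False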
@@ -839,24 +992,24 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
if torch.version.hip: if torch.version.hip:
return True return True
props = torch.cuda.get_device_properties("cuda") props = torch.cuda.get_device_properties(device)
if props.major >= 8: if props.major >= 8:
return True return True
if props.major < 6: if props.major < 6:
return False return False
fp16_works = False #FP16 is confirmed working on a 1080 (GP104) and on latest pytorch actually seems faster than fp32
#FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
#when the model doesn't actually fit on the card
#TODO: actually test if GP106 and others have the same type of behavior
nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"] nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
for x in nvidia_10_series: for x in nvidia_10_series:
if x in props.name.lower(): if x in props.name.lower():
fp16_works = True if WINDOWS or manual_cast:
return True
else:
return False #weird linux behavior where fp32 is faster
if fp16_works or manual_cast: if manual_cast:
free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory()) free_model_memory = maximum_vram_for_weights(device)
if (not prioritize_performance) or model_params * 4 > free_model_memory: if (not prioritize_performance) or model_params * 4 > free_model_memory:
return True return True
@@ -876,9 +1029,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
return False return False
if device is not None: #TODO not sure about mps bf16 support if device is not None:
if is_device_mps(device): if is_device_mps(device):
return False return True
if FORCE_FP32: if FORCE_FP32:
return False return False
@@ -886,15 +1039,15 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
if directml_enabled: if directml_enabled:
return False return False
if cpu_mode() or mps_mode(): if mps_mode():
return True
if cpu_mode():
return False return False
if is_intel_xpu(): if is_intel_xpu():
return True return True
if device is None:
device = torch.device("cuda")
props = torch.cuda.get_device_properties(device) props = torch.cuda.get_device_properties(device)
if props.major >= 8: if props.major >= 8:
return True return True
@@ -902,12 +1055,33 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
bf16_works = torch.cuda.is_bf16_supported() bf16_works = torch.cuda.is_bf16_supported()
if bf16_works or manual_cast: if bf16_works or manual_cast:
free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory()) free_model_memory = maximum_vram_for_weights(device)
if (not prioritize_performance) or model_params * 4 > free_model_memory: if (not prioritize_performance) or model_params * 4 > free_model_memory:
return True return True
return False return False
def supports_fp8_compute(device=None):
if not is_nvidia():
return False
props = torch.cuda.get_device_properties(device)
if props.major >= 9:
return True
if props.major < 8:
return False
if props.minor < 9:
return False
if int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) < 3):
return False
if WINDOWS:
if (int(torch_version[0]) == 2 and int(torch_version[2]) < 4):
return False
return True
def soft_empty_cache(force=False): def soft_empty_cache(force=False):
global cpu_state global cpu_state
if cpu_state == CPUState.MPS: if cpu_state == CPUState.MPS:


@@ -1,34 +1,47 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch import torch
import copy import copy
import inspect import inspect
import logging import logging
import uuid import uuid
import collections
import math
import comfy.utils import comfy.utils
import comfy.float
import comfy.model_management import comfy.model_management
from comfy.types import UnetWrapperFunction import comfy.lora
from comfy.comfy_types import UnetWrapperFunction
def weight_decompose(dora_scale, weight, lora_diff, alpha, strength):
dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, torch.float32)
lora_diff *= alpha
weight_calc = weight + lora_diff.type(weight.dtype)
weight_norm = (
weight_calc.transpose(0, 1)
.reshape(weight_calc.shape[1], -1)
.norm(dim=1, keepdim=True)
.reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
.transpose(0, 1)
)
weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
if strength != 1.0:
weight_calc -= weight
weight += strength * (weight_calc)
else:
weight[:] = weight_calc
return weight
def string_to_seed(data):
crc = 0xFFFFFFFF
for byte in data:
if isinstance(byte, str):
byte = ord(byte)
crc ^= byte
for _ in range(8):
if crc & 1:
crc = (crc >> 1) ^ 0xEDB88320
else:
crc >>= 1
return crc ^ 0xFFFFFFFF
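string_to_seed is a plain bit-by-bit CRC-32, so every patch key maps to a stable 32-bit seed for the stochastic rounding used below. For ASCII keys this should match zlib.crc32, which makes it easy to sanity-check (the key string here is just an example):

import zlib

def string_to_seed(data):
    # Same CRC-32 loop as above, repeated here so the check is self-contained.
    crc = 0xFFFFFFFF
    for byte in data:
        if isinstance(byte, str):
            byte = ord(byte)
        crc ^= byte
        for _ in range(8):
            if crc & 1:
                crc = (crc >> 1) ^ 0xEDB88320
            else:
                crc >>= 1
    return crc ^ 0xFFFFFFFF

key = "diffusion_model.out.0.weight"
print(string_to_seed(key) == zlib.crc32(key.encode()))  # True for ASCII keys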
def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None): def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
to = model_options["transformer_options"].copy() to = model_options["transformer_options"].copy()
@@ -63,10 +76,59 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
model_options["disable_cfg1_optimization"] = True model_options["disable_cfg1_optimization"] = True
return model_options return model_options
def wipe_lowvram_weight(m):
if hasattr(m, "prev_comfy_cast_weights"):
m.comfy_cast_weights = m.prev_comfy_cast_weights
del m.prev_comfy_cast_weights
m.weight_function = None
m.bias_function = None
class LowVramPatch:
def __init__(self, key, patches):
self.key = key
self.patches = patches
def __call__(self, weight):
intermediate_dtype = weight.dtype
if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops
intermediate_dtype = torch.float32
return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key))
return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype)
def get_key_weight(model, key):
set_func = None
convert_func = None
op_keys = key.rsplit('.', 1)
if len(op_keys) < 2:
weight = comfy.utils.get_attr(model, key)
else:
op = comfy.utils.get_attr(model, op_keys[0])
try:
set_func = getattr(op, "set_{}".format(op_keys[1]))
except AttributeError:
pass
try:
convert_func = getattr(op, "convert_{}".format(op_keys[1]))
except AttributeError:
pass
weight = getattr(op, op_keys[1])
if convert_func is not None:
weight = comfy.utils.get_attr(model, key)
return weight, set_func, convert_func
class ModelPatcher: class ModelPatcher:
def __init__(self, model, load_device, offload_device, size=0, current_device=None, weight_inplace_update=False): def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
self.size = size self.size = size
self.model = model self.model = model
if not hasattr(self.model, 'device'):
logging.debug("Model doesn't have a device attribute.")
self.model.device = offload_device
elif self.model.device is None:
self.model.device = offload_device
self.patches = {} self.patches = {}
self.backup = {} self.backup = {}
self.object_patches = {} self.object_patches = {}
@@ -75,24 +137,32 @@ class ModelPatcher:
self.model_size() self.model_size()
self.load_device = load_device self.load_device = load_device
self.offload_device = offload_device self.offload_device = offload_device
if current_device is None:
self.current_device = self.offload_device
else:
self.current_device = current_device
self.weight_inplace_update = weight_inplace_update self.weight_inplace_update = weight_inplace_update
self.model_lowvram = False
self.lowvram_patch_counter = 0
self.patches_uuid = uuid.uuid4() self.patches_uuid = uuid.uuid4()
if not hasattr(self.model, 'model_loaded_weight_memory'):
self.model.model_loaded_weight_memory = 0
if not hasattr(self.model, 'lowvram_patch_counter'):
self.model.lowvram_patch_counter = 0
if not hasattr(self.model, 'model_lowvram'):
self.model.model_lowvram = False
def model_size(self): def model_size(self):
if self.size > 0: if self.size > 0:
return self.size return self.size
self.size = comfy.model_management.module_size(self.model) self.size = comfy.model_management.module_size(self.model)
return self.size return self.size
def loaded_size(self):
return self.model.model_loaded_weight_memory
def lowvram_patch_counter(self):
return self.model.lowvram_patch_counter
def clone(self): def clone(self):
n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, weight_inplace_update=self.weight_inplace_update) n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
n.patches = {} n.patches = {}
for k in self.patches: for k in self.patches:
n.patches[k] = self.patches[k][:] n.patches[k] = self.patches[k][:]
@@ -242,17 +312,23 @@ class ModelPatcher:
return list(p) return list(p)
def get_key_patches(self, filter_prefix=None): def get_key_patches(self, filter_prefix=None):
comfy.model_management.unload_model_clones(self)
model_sd = self.model_state_dict() model_sd = self.model_state_dict()
p = {} p = {}
for k in model_sd: for k in model_sd:
if filter_prefix is not None: if filter_prefix is not None:
if not k.startswith(filter_prefix): if not k.startswith(filter_prefix):
continue continue
bk = self.backup.get(k, None)
weight, set_func, convert_func = get_key_weight(self.model, k)
if bk is not None:
weight = bk.weight
if convert_func is None:
convert_func = lambda a, **kwargs: a
if k in self.patches: if k in self.patches:
p[k] = [model_sd[k]] + self.patches[k] p[k] = [(weight, convert_func)] + self.patches[k]
else: else:
p[k] = (model_sd[k],) p[k] = [(weight, convert_func)]
return p return p
def model_state_dict(self, filter_prefix=None): def model_state_dict(self, filter_prefix=None):
@@ -264,67 +340,57 @@ class ModelPatcher:
sd.pop(k) sd.pop(k)
return sd return sd
def patch_weight_to_device(self, key, device_to=None): def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
if key not in self.patches: if key not in self.patches:
return return
weight = comfy.utils.get_attr(self.model, key) weight, set_func, convert_func = get_key_weight(self.model, key)
inplace_update = self.weight_inplace_update or inplace_update
inplace_update = self.weight_inplace_update
if key not in self.backup: if key not in self.backup:
self.backup[key] = weight.to(device=self.offload_device, copy=inplace_update) self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
if device_to is not None: if device_to is not None:
temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True) temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
else: else:
temp_weight = weight.to(torch.float32, copy=True) temp_weight = weight.to(torch.float32, copy=True)
out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype) if convert_func is not None:
if inplace_update: temp_weight = convert_func(temp_weight, inplace=True)
comfy.utils.copy_to_param(self.model, key, out_weight)
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
if set_func is None:
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
if inplace_update:
comfy.utils.copy_to_param(self.model, key, out_weight)
else:
comfy.utils.set_attr_param(self.model, key, out_weight)
else: else:
comfy.utils.set_attr_param(self.model, key, out_weight) set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
def patch_model(self, device_to=None, patch_weights=True):
for k in self.object_patches:
old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
if k not in self.object_patches_backup:
self.object_patches_backup[k] = old
if patch_weights:
model_sd = self.model_state_dict()
for key in self.patches:
if key not in model_sd:
logging.warning("could not patch. key doesn't exist in model: {}".format(key))
continue
self.patch_weight_to_device(key, device_to)
if device_to is not None:
self.model.to(device_to)
self.current_device = device_to
return self.model
def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
self.patch_model(device_to, patch_weights=False)
logging.info("loading in lowvram mode {}".format(lowvram_model_memory/(1024 * 1024)))
class LowVramPatch:
def __init__(self, key, model_patcher):
self.key = key
self.model_patcher = model_patcher
def __call__(self, weight):
return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)
def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
mem_counter = 0 mem_counter = 0
patch_counter = 0 patch_counter = 0
lowvram_counter = 0
loading = []
for n, m in self.model.named_modules(): for n, m in self.model.named_modules():
if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight"):
loading.append((comfy.model_management.module_size(m), n, m))
load_completely = []
loading.sort(reverse=True)
for x in loading:
n = x[1]
m = x[2]
module_mem = x[0]
lowvram_weight = False lowvram_weight = False
if hasattr(m, "comfy_cast_weights"):
module_mem = comfy.model_management.module_size(m) if not full_load and hasattr(m, "comfy_cast_weights"):
if mem_counter + module_mem >= lowvram_model_memory: if mem_counter + module_mem >= lowvram_model_memory:
lowvram_weight = True lowvram_weight = True
lowvram_counter += 1
if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
continue
weight_key = "{}.weight".format(n) weight_key = "{}.weight".format(n)
bias_key = "{}.bias".format(n) bias_key = "{}.bias".format(n)
@@ -334,227 +400,173 @@ class ModelPatcher:
if force_patch_weights: if force_patch_weights:
self.patch_weight_to_device(weight_key) self.patch_weight_to_device(weight_key)
else: else:
m.weight_function = LowVramPatch(weight_key, self) m.weight_function = LowVramPatch(weight_key, self.patches)
patch_counter += 1 patch_counter += 1
if bias_key in self.patches: if bias_key in self.patches:
if force_patch_weights: if force_patch_weights:
self.patch_weight_to_device(bias_key) self.patch_weight_to_device(bias_key)
else: else:
m.bias_function = LowVramPatch(bias_key, self) m.bias_function = LowVramPatch(bias_key, self.patches)
patch_counter += 1 patch_counter += 1
m.prev_comfy_cast_weights = m.comfy_cast_weights m.prev_comfy_cast_weights = m.comfy_cast_weights
m.comfy_cast_weights = True m.comfy_cast_weights = True
else: else:
if hasattr(m, "comfy_cast_weights"):
if m.comfy_cast_weights:
wipe_lowvram_weight(m)
if hasattr(m, "weight"): if hasattr(m, "weight"):
self.patch_weight_to_device(weight_key, device_to) mem_counter += module_mem
self.patch_weight_to_device(bias_key, device_to) load_completely.append((module_mem, n, m))
m.to(device_to)
mem_counter += comfy.model_management.module_size(m)
logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
self.model_lowvram = True load_completely.sort(reverse=True)
self.lowvram_patch_counter = patch_counter for x in load_completely:
n = x[1]
m = x[2]
weight_key = "{}.weight".format(n)
bias_key = "{}.bias".format(n)
if hasattr(m, "comfy_patched_weights"):
if m.comfy_patched_weights == True:
continue
self.patch_weight_to_device(weight_key, device_to=device_to)
self.patch_weight_to_device(bias_key, device_to=device_to)
logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
m.comfy_patched_weights = True
for x in load_completely:
x[2].to(device_to)
if lowvram_counter > 0:
logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
self.model.model_lowvram = True
else:
logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
self.model.model_lowvram = False
if full_load:
self.model.to(device_to)
mem_counter = self.model_size()
self.model.lowvram_patch_counter += patch_counter
self.model.device = device_to
self.model.model_loaded_weight_memory = mem_counter
def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
for k in self.object_patches:
old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
if k not in self.object_patches_backup:
self.object_patches_backup[k] = old
if lowvram_model_memory == 0:
full_load = True
else:
full_load = False
if load_weights:
self.load(device_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights, full_load=full_load)
return self.model return self.model
    def calculate_weight(self, patches, weight, key):
        for p in patches:
            strength = p[0]
            v = p[1]
            strength_model = p[2]
            offset = p[3]
            function = p[4]
            if function is None:
                function = lambda a: a

            old_weight = None
            if offset is not None:
                old_weight = weight
                weight = weight.narrow(offset[0], offset[1], offset[2])

            if strength_model != 1.0:
                weight *= strength_model

            if isinstance(v, list):
                v = (self.calculate_weight(v[1:], v[0].clone(), key), )

            if len(v) == 1:
                patch_type = "diff"
            elif len(v) == 2:
                patch_type = v[0]
                v = v[1]

            if patch_type == "diff":
                w1 = v[0]
                if strength != 0.0:
                    if w1.shape != weight.shape:
                        logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape))
                    else:
                        weight += function(strength * comfy.model_management.cast_to_device(w1, weight.device, weight.dtype))
            elif patch_type == "lora": #lora/locon
                mat1 = comfy.model_management.cast_to_device(v[0], weight.device, torch.float32)
                mat2 = comfy.model_management.cast_to_device(v[1], weight.device, torch.float32)
                dora_scale = v[4]
                if v[2] is not None:
                    alpha = v[2] / mat2.shape[0]
                else:
                    alpha = 1.0

                if v[3] is not None:
                    #locon mid weights, hopefully the math is fine because I didn't properly test it
                    mat3 = comfy.model_management.cast_to_device(v[3], weight.device, torch.float32)
                    final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
                    mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
                try:
                    lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
                    if dora_scale is not None:
                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
                    else:
                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
                except Exception as e:
                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
            elif patch_type == "lokr":
                w1 = v[0]
                w2 = v[1]
                w1_a = v[3]
                w1_b = v[4]
                w2_a = v[5]
                w2_b = v[6]
                t2 = v[7]
                dora_scale = v[8]
                dim = None

                if w1 is None:
                    dim = w1_b.shape[0]
                    w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, torch.float32),
                                  comfy.model_management.cast_to_device(w1_b, weight.device, torch.float32))
                else:
                    w1 = comfy.model_management.cast_to_device(w1, weight.device, torch.float32)

                if w2 is None:
                    dim = w2_b.shape[0]
                    if t2 is None:
                        w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32),
                                      comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32))
                    else:
                        w2 = torch.einsum('i j k l, j r, i p -> p r k l',
                                          comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
                                          comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32),
                                          comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32))
                else:
                    w2 = comfy.model_management.cast_to_device(w2, weight.device, torch.float32)

                if len(w2.shape) == 4:
                    w1 = w1.unsqueeze(2).unsqueeze(2)
                if v[2] is not None and dim is not None:
                    alpha = v[2] / dim
                else:
                    alpha = 1.0

                try:
                    lora_diff = torch.kron(w1, w2).reshape(weight.shape)
                    if dora_scale is not None:
                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
                    else:
                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
                except Exception as e:
                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
            elif patch_type == "loha":
                w1a = v[0]
                w1b = v[1]
                if v[2] is not None:
                    alpha = v[2] / w1b.shape[0]
                else:
                    alpha = 1.0

                w2a = v[3]
                w2b = v[4]
                dora_scale = v[7]
                if v[5] is not None: #cp decomposition
                    t1 = v[5]
                    t2 = v[6]
                    m1 = torch.einsum('i j k l, j r, i p -> p r k l',
                                      comfy.model_management.cast_to_device(t1, weight.device, torch.float32),
                                      comfy.model_management.cast_to_device(w1b, weight.device, torch.float32),
                                      comfy.model_management.cast_to_device(w1a, weight.device, torch.float32))

                    m2 = torch.einsum('i j k l, j r, i p -> p r k l',
                                      comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
                                      comfy.model_management.cast_to_device(w2b, weight.device, torch.float32),
                                      comfy.model_management.cast_to_device(w2a, weight.device, torch.float32))
                else:
                    m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, torch.float32),
                                  comfy.model_management.cast_to_device(w1b, weight.device, torch.float32))
                    m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, torch.float32),
                                  comfy.model_management.cast_to_device(w2b, weight.device, torch.float32))

                try:
                    lora_diff = (m1 * m2).reshape(weight.shape)
                    if dora_scale is not None:
                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
                    else:
                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
                except Exception as e:
                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
            elif patch_type == "glora":
                if v[4] is not None:
                    alpha = v[4] / v[0].shape[0]
                else:
                    alpha = 1.0

                dora_scale = v[5]

                a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, torch.float32)
                a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, torch.float32)
                b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, torch.float32)
                b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, torch.float32)

                try:
                    lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1), a2), a1)).reshape(weight.shape)
                    if dora_scale is not None:
                        weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
                    else:
                        weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
                except Exception as e:
                    logging.error("ERROR {} {} {}".format(patch_type, key, e))
            else:
                logging.warning("patch type not recognized {} {}".format(patch_type, key))

            if old_weight is not None:
                weight = old_weight

        return weight
    def unpatch_model(self, device_to=None, unpatch_weights=True):
        if unpatch_weights:
            if self.model.model_lowvram:
                for m in self.model.modules():
                    wipe_lowvram_weight(m)

                self.model.model_lowvram = False
                self.model.lowvram_patch_counter = 0

            keys = list(self.backup.keys())

            for k in keys:
                bk = self.backup[k]
                if bk.inplace_update:
                    comfy.utils.copy_to_param(self.model, k, bk.weight)
                else:
                    comfy.utils.set_attr_param(self.model, k, bk.weight)

            self.backup.clear()

            if device_to is not None:
                self.model.to(device_to)
                self.model.device = device_to
            self.model.model_loaded_weight_memory = 0

            for m in self.model.modules():
                if hasattr(m, "comfy_patched_weights"):
                    del m.comfy_patched_weights

        keys = list(self.object_patches_backup.keys())
        for k in keys:
            comfy.utils.set_attr(self.model, k, self.object_patches_backup[k])

        self.object_patches_backup.clear()
    def partially_unload(self, device_to, memory_to_free=0):
        memory_freed = 0
        patch_counter = 0
        unload_list = []

        for n, m in self.model.named_modules():
            shift_lowvram = False
            if hasattr(m, "comfy_cast_weights"):
                module_mem = comfy.model_management.module_size(m)
                unload_list.append((module_mem, n, m))

        unload_list.sort()
        for unload in unload_list:
            if memory_to_free < memory_freed:
                break
            module_mem = unload[0]
            n = unload[1]
            m = unload[2]
            weight_key = "{}.weight".format(n)
            bias_key = "{}.bias".format(n)

            if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
                for key in [weight_key, bias_key]:
                    bk = self.backup.get(key, None)
                    if bk is not None:
                        if bk.inplace_update:
                            comfy.utils.copy_to_param(self.model, key, bk.weight)
                        else:
                            comfy.utils.set_attr_param(self.model, key, bk.weight)
                        self.backup.pop(key)

                m.to(device_to)
                if weight_key in self.patches:
                    m.weight_function = LowVramPatch(weight_key, self.patches)
                    patch_counter += 1
                if bias_key in self.patches:
                    m.bias_function = LowVramPatch(bias_key, self.patches)
                    patch_counter += 1

                m.prev_comfy_cast_weights = m.comfy_cast_weights
                m.comfy_cast_weights = True
                m.comfy_patched_weights = False
                memory_freed += module_mem
                logging.debug("freed {}".format(n))

        self.model.model_lowvram = True
        self.model.lowvram_patch_counter += patch_counter
        self.model.model_loaded_weight_memory -= memory_freed
        return memory_freed
    def partially_load(self, device_to, extra_memory=0):
        self.unpatch_model(unpatch_weights=False)
        self.patch_model(load_weights=False)
        full_load = False
        if self.model.model_lowvram == False:
            return 0
        if self.model.model_loaded_weight_memory + extra_memory > self.model_size():
            full_load = True
        current_used = self.model.model_loaded_weight_memory
        self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
        return self.model.model_loaded_weight_memory - current_used

    def current_loaded_device(self):
        return self.model.device

    def calculate_weight(self, patches, weight, key, intermediate_dtype=torch.float32):
        print("WARNING the ModelPatcher.calculate_weight function is deprecated, please use: comfy.lora.calculate_weight instead")
        return comfy.lora.calculate_weight(patches, weight, key, intermediate_dtype=intermediate_dtype)
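For orientation, the "lora" branch of the calculate_weight code above (which now lives behind comfy.lora.calculate_weight) reduces to a low-rank update: the patch supplies two factor matrices and the merged weight is weight + (strength * alpha) * (mat1 @ mat2). A minimal self-contained sketch of that merge in plain PyTorch; the function and variable names here are illustrative, not part of the repository API:

import torch

def merge_lora_weight(weight, mat1, mat2, strength=1.0, alpha=1.0):
    # weight: [out, in], mat1 ("lora_up"): [out, rank], mat2 ("lora_down"): [rank, in]
    lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
    return weight + ((strength * alpha) * lora_diff).type(weight.dtype)

w = torch.randn(8, 16)
up = torch.randn(8, 4) * 0.01
down = torch.randn(4, 16) * 0.01
print(merge_lora_weight(w, up, down, strength=0.8).shape)  # torch.Size([8, 16])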


@@ -59,8 +59,9 @@ class ModelSamplingDiscrete(torch.nn.Module):
        beta_schedule = sampling_settings.get("beta_schedule", "linear")
        linear_start = sampling_settings.get("linear_start", 0.00085)
        linear_end = sampling_settings.get("linear_end", 0.012)
        timesteps = sampling_settings.get("timesteps", 1000)

        self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3)
        self.sigma_data = 1.0

    def _register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
@@ -271,3 +272,43 @@ class StableCascadeSampling(ModelSamplingDiscrete):
        percent = 1.0 - percent
        return self.sigma(torch.tensor(percent))
def flux_time_shift(mu: float, sigma: float, t):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

class ModelSamplingFlux(torch.nn.Module):
    def __init__(self, model_config=None):
        super().__init__()
        if model_config is not None:
            sampling_settings = model_config.sampling_settings
        else:
            sampling_settings = {}

        self.set_parameters(shift=sampling_settings.get("shift", 1.15))

    def set_parameters(self, shift=1.15, timesteps=10000):
        self.shift = shift
        ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps))
        self.register_buffer('sigmas', ts)

    @property
    def sigma_min(self):
        return self.sigmas[0]

    @property
    def sigma_max(self):
        return self.sigmas[-1]

    def timestep(self, sigma):
        return sigma

    def sigma(self, timestep):
        return flux_time_shift(self.shift, 1.0, timestep)

    def percent_to_sigma(self, percent):
        if percent <= 0.0:
            return 1.0
        if percent >= 1.0:
            return 0.0
        return 1.0 - percent
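flux_time_shift above is a logistic-style remap of the flow-matching timestep, so ModelSamplingFlux's sigma table is simply that curve sampled on a uniform grid. A quick standalone sanity check of the curve's shape (values are approximate and nothing here depends on the rest of the file):

import math

def flux_time_shift(mu, sigma, t):
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

for t in (0.25, 0.5, 0.75, 1.0):
    print(t, round(flux_time_shift(1.15, 1.0, t), 4))
# roughly: 0.25 -> 0.5128, 0.5 -> 0.7595, 0.75 -> 0.9045, 1.0 -> 1.0
# i.e. with the default shift of 1.15 most of the schedule sits at relatively high sigma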


@@ -18,16 +18,34 @@
import torch
import comfy.model_management
from comfy.cli_args import args
import comfy.float

cast_to = comfy.model_management.cast_to #TODO: remove once no more references

def cast_to_input(weight, input, non_blocking=False, copy=True):
    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
    if input is not None:
        if dtype is None:
            dtype = input.dtype
        if bias_dtype is None:
            bias_dtype = dtype
        if device is None:
            device = input.device

    bias = None
    non_blocking = comfy.model_management.device_supports_non_blocking(device)
    if s.bias is not None:
        has_function = s.bias_function is not None
        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function)
        if has_function:
            bias = s.bias_function(bias)

    has_function = s.weight_function is not None
    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function)
    if has_function:
        weight = s.weight_function(weight)
    return weight, bias
@@ -168,6 +186,26 @@ class disable_weight_init:
            else:
                return super().forward(*args, **kwargs)
    class Embedding(torch.nn.Embedding, CastWeightBiasOp):
        def reset_parameters(self):
            self.bias = None
            return None

        def forward_comfy_cast_weights(self, input, out_dtype=None):
            output_dtype = out_dtype
            if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
                out_dtype = None
            weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
            return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)

        def forward(self, *args, **kwargs):
            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                if "out_dtype" in kwargs:
                    kwargs.pop("out_dtype")
                return super().forward(*args, **kwargs)
    @classmethod
    def conv_nd(s, dims, *args, **kwargs):
        if dims == 2:
@@ -202,3 +240,123 @@ class manual_cast(disable_weight_init):
    class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
        comfy_cast_weights = True

    class Embedding(disable_weight_init.Embedding):
        comfy_cast_weights = True
def fp8_linear(self, input):
    dtype = self.weight.dtype
    if dtype not in [torch.float8_e4m3fn]:
        return None

    tensor_2d = False
    if len(input.shape) == 2:
        tensor_2d = True
        input = input.unsqueeze(1)

    if len(input.shape) == 3:
        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input.dtype)
        w = w.t()

        scale_weight = self.scale_weight
        scale_input = self.scale_input
        if scale_weight is None:
            scale_weight = torch.ones((), device=input.device, dtype=torch.float32)

        if scale_input is None:
            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
            inn = input.reshape(-1, input.shape[2]).to(dtype)
        else:
            inn = (input * (1.0 / scale_input).to(input.dtype)).reshape(-1, input.shape[2]).to(dtype)

        if bias is not None:
            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
        else:
            o = torch._scaled_mm(inn, w, out_dtype=input.dtype, scale_a=scale_input, scale_b=scale_weight)

        if isinstance(o, tuple):
            o = o[0]

        if tensor_2d:
            return o.reshape(input.shape[0], -1)

        return o.reshape((-1, input.shape[1], self.weight.shape[0]))

    return None
class fp8_ops(manual_cast):
    class Linear(manual_cast.Linear):
        def reset_parameters(self):
            self.scale_weight = None
            self.scale_input = None
            return None

        def forward_comfy_cast_weights(self, input):
            out = fp8_linear(self, input)
            if out is not None:
                return out

            weight, bias = cast_bias_weight(self, input)
            return torch.nn.functional.linear(input, weight, bias)
def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
    class scaled_fp8_op(manual_cast):
        class Linear(manual_cast.Linear):
            def __init__(self, *args, **kwargs):
                if override_dtype is not None:
                    kwargs['dtype'] = override_dtype
                super().__init__(*args, **kwargs)

            def reset_parameters(self):
                if not hasattr(self, 'scale_weight'):
                    self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)

                if not scale_input:
                    self.scale_input = None

                if not hasattr(self, 'scale_input'):
                    self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
                return None

            def forward_comfy_cast_weights(self, input):
                if fp8_matrix_mult:
                    out = fp8_linear(self, input)
                    if out is not None:
                        return out

                weight, bias = cast_bias_weight(self, input)

                if weight.numel() < input.numel(): #TODO: optimize
                    return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias)
                else:
                    return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias)

            def convert_weight(self, weight, inplace=False, **kwargs):
                if inplace:
                    weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
                    return weight
                else:
                    return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype)

            def set_weight(self, weight, inplace_update=False, seed=None, **kwargs):
                weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
                if inplace_update:
                    self.weight.data.copy_(weight)
                else:
                    self.weight = torch.nn.Parameter(weight, requires_grad=False)

    return scaled_fp8_op
def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
    if scaled_fp8 is not None:
        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)

    if fp8_compute and (fp8_optimizations or args.fast) and not disable_fast_fp8:
        return fp8_ops

    if compute_dtype is None or weight_dtype == compute_dtype:
        return disable_weight_init

    return manual_cast
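In the scaled-fp8 path above the stored Linear weight is an fp8 tensor plus a single float32 scale_weight, and convert_weight (as well as the non-matrix-mult forward) dequantizes with an elementwise multiply. A rough standalone sketch of that round trip; illustrative only, torch.float8_e4m3fn needs a recent PyTorch build and 448 is used here as the approximate e4m3 maximum:

import torch

w = torch.randn(64, 64) * 0.05                   # original higher-precision weight
scale = (w.abs().max() / 448.0).to(torch.float32)
w_fp8 = (w / scale).to(torch.float8_e4m3fn)      # what would be stored as self.weight
w_dequant = w_fp8.to(torch.float32) * scale      # equivalent of convert_weight(self.weight)

print((w - w_dequant).abs().max())               # small quantization error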


@@ -61,7 +61,9 @@ def prepare_sampling(model, noise_shape, conds):
    device = model.load_device
    real_model = None
    models, inference_memory = get_additional_models(conds, model.model_dtype())
    memory_required = model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory
    minimum_memory_required = model.memory_required([noise_shape[0]] + list(noise_shape[1:])) + inference_memory
    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required)
    real_model = model.model

    return real_model, conds, models


@@ -6,6 +6,8 @@ from comfy import model_management
import math
import logging
import comfy.sampler_helpers
import scipy.stats
import numpy

def get_area_and_mult(conds, x_in, timestep_in):
    dims = tuple(x_in.shape[2:])
@@ -169,7 +171,7 @@ def calc_cond_batch(model, conds, x_in, timestep, model_options):
        for i in range(1, len(to_batch_temp) + 1):
            batch_amount = to_batch_temp[:len(to_batch_temp)//i]
            input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
            if model.memory_required(input_shape) * 1.5 < free_memory:
                to_batch = batch_amount
                break
@@ -311,13 +313,18 @@ def simple_scheduler(model_sampling, steps):
def ddim_scheduler(model_sampling, steps):
    s = model_sampling
    sigs = []
    x = 1
    if math.isclose(float(s.sigmas[x]), 0, abs_tol=0.00001):
        steps += 1
        sigs = []
    else:
        sigs = [0.0]

    ss = max(len(s.sigmas) // steps, 1)
    while x < len(s.sigmas):
        sigs += [float(s.sigmas[x])]
        x += ss
    sigs = sigs[::-1]
    return torch.FloatTensor(sigs)
def normal_scheduler(model_sampling, steps, sgm=False, floor=False):
@@ -325,15 +332,37 @@ def normal_scheduler(model_sampling, steps, sgm=False, floor=False):
    start = s.timestep(s.sigma_max)
    end = s.timestep(s.sigma_min)

    append_zero = True
    if sgm:
        timesteps = torch.linspace(start, end, steps + 1)[:-1]
    else:
        if math.isclose(float(s.sigma(end)), 0, abs_tol=0.00001):
            steps += 1
            append_zero = False
        timesteps = torch.linspace(start, end, steps)

    sigs = []
    for x in range(len(timesteps)):
        ts = timesteps[x]
        sigs.append(float(s.sigma(ts)))

    if append_zero:
        sigs += [0.0]

    return torch.FloatTensor(sigs)
# Implemented based on: https://arxiv.org/abs/2407.12173
def beta_scheduler(model_sampling, steps, alpha=0.6, beta=0.6):
    total_timesteps = (len(model_sampling.sigmas) - 1)
    ts = 1 - numpy.linspace(0, 1, steps, endpoint=False)
    ts = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps)

    sigs = []
    last_t = -1
    for t in ts:
        if t != last_t:
            sigs += [float(model_sampling.sigmas[int(t)])]
        last_t = t
    sigs += [0.0]
    return torch.FloatTensor(sigs)
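beta_scheduler above spaces its timestep indices through the inverse CDF of a Beta(alpha, beta) distribution; with the default alpha = beta = 0.6 the distribution is U-shaped, so indices cluster toward both ends of the sigma table and duplicate indices are dropped. A small standalone sketch of just the index selection (the 1000-entry table size is only an example):

import numpy
import scipy.stats

steps = 8
alpha = beta = 0.6
total_timesteps = 999  # e.g. a 1000-entry sigma table

ts = 1 - numpy.linspace(0, 1, steps, endpoint=False)
idx = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps).astype(int)
print(idx)  # indices sit closer together near 0 and near 999 than in the middle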
@@ -544,8 +573,8 @@ class Sampler:
        return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma

KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral",
                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
                  "ipndm", "ipndm_v", "deis"]

class KSAMPLER(Sampler):
@@ -703,7 +732,7 @@ def sample(model, noise, positive, negative, cfg, device, sampler, sigmas, model
    return cfg_guider.sample(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)

SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform", "beta"]
SAMPLER_NAMES = KSAMPLER_NAMES + ["ddim", "uni_pc", "uni_pc_bh2"]

def calculate_sigmas(model_sampling, scheduler_name, steps):
@@ -719,6 +748,8 @@ def calculate_sigmas(model_sampling, scheduler_name, steps):
        sigmas = ddim_scheduler(model_sampling, steps)
    elif scheduler_name == "sgm_uniform":
        sigmas = normal_scheduler(model_sampling, steps, sgm=True)
    elif scheduler_name == "beta":
        sigmas = beta_scheduler(model_sampling, steps)
    else:
        logging.error("error invalid scheduler {}".format(scheduler_name))
    return sigmas


@@ -17,16 +17,18 @@ from . import diffusers_convert
from . import model_detection

from . import sd1_clip
from . import sdxl_clip
import comfy.text_encoders.sd2_clip
import comfy.text_encoders.sd3_clip
import comfy.text_encoders.sa_t5
import comfy.text_encoders.aura_t5
import comfy.text_encoders.hydit
import comfy.text_encoders.flux
import comfy.text_encoders.long_clipl

import comfy.model_patcher
import comfy.lora
import comfy.t2i_adapter.adapter
import comfy.supported_models_base
import comfy.taesd.taesd

def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
@@ -60,29 +62,38 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
class CLIP:
    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
        if no_init:
            return
        params = target.params.copy()
        clip = target.clip
        tokenizer = target.tokenizer

        load_device = model_options.get("load_device", model_management.text_encoder_device())
        offload_device = model_options.get("offload_device", model_management.text_encoder_offload_device())
        dtype = model_options.get("dtype", None)
        if dtype is None:
            dtype = model_management.text_encoder_dtype(load_device)

        params['dtype'] = dtype
        params['device'] = model_options.get("initial_device", model_management.text_encoder_initial_device(load_device, offload_device, parameters * model_management.dtype_size(dtype)))
        params['model_options'] = model_options

        self.cond_stage_model = clip(**(params))

        for dt in self.cond_stage_model.dtypes:
            if not model_management.supports_cast(load_device, dt):
                load_device = offload_device
                if params['device'] != offload_device:
                    self.cond_stage_model.to(offload_device)
                    logging.warning("Had to shift TE back.")

        self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
        if params['device'] == load_device:
            model_management.load_models_gpu([self.patcher], force_full_load=True)
        self.layer_idx = None
        logging.debug("CLIP model load device: {}, offload device: {}, current: {}".format(load_device, offload_device, params['device']))

    def clone(self):
        n = CLIP(no_init=True)
@@ -135,7 +146,11 @@ class CLIP:
        return self.cond_stage_model.load_sd(sd)

    def get_sd(self):
        sd_clip = self.cond_stage_model.state_dict()
        sd_tokenizer = self.tokenizer.state_dict()
        for k in sd_tokenizer:
            sd_clip[k] = sd_tokenizer[k]
        return sd_clip

    def load_model(self):
        model_management.load_model_gpu(self.patcher)
@@ -332,7 +347,7 @@ class VAE:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
            batch_number = int(free_memory / max(1, memory_used))
            batch_number = max(1, batch_number)
            samples = torch.empty((pixel_samples.shape[0], self.latent_channels) + tuple(map(lambda a: a // self.downscale_ratio, pixel_samples.shape[2:])), device=self.output_device)
            for x in range(0, pixel_samples.shape[0], batch_number):
@@ -381,11 +396,55 @@ class CLIPType(Enum):
    STABLE_CASCADE = 2
    SD3 = 3
    STABLE_AUDIO = 4
    HUNYUAN_DIT = 5
    FLUX = 6

def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
    clip_data = []
    for p in ckpt_paths:
        clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))
    return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
class TEModel(Enum):
    CLIP_L = 1
    CLIP_H = 2
    CLIP_G = 3
    T5_XXL = 4
    T5_XL = 5
    T5_BASE = 6

def detect_te_model(sd):
    if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
        return TEModel.CLIP_G
    if "text_model.encoder.layers.22.mlp.fc1.weight" in sd:
        return TEModel.CLIP_H
    if "text_model.encoder.layers.0.mlp.fc1.weight" in sd:
        return TEModel.CLIP_L
    if "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
        weight = sd["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
        if weight.shape[-1] == 4096:
            return TEModel.T5_XXL
        elif weight.shape[-1] == 2048:
            return TEModel.T5_XL
    if "encoder.block.0.layer.0.SelfAttention.k.weight" in sd:
        return TEModel.T5_BASE
    return None

def t5xxl_detect(clip_data):
    weight_name = "encoder.block.23.layer.1.DenseReluDense.wi_1.weight"
    dtype_t5 = None
    for sd in clip_data:
        if weight_name in sd:
            return comfy.text_encoders.sd3_clip.t5_xxl_detect(sd)
    return {}
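detect_te_model above distinguishes text encoders purely by characteristic state-dict keys (plus the T5 feed-forward width), so it can be exercised with dummy dictionaries. A minimal sketch, assuming TEModel and detect_te_model from this file are in scope; the tensor sizes are placeholders and only the last dimension matters for the T5 check:

import torch

fake_clip_l = {"text_model.encoder.layers.0.mlp.fc1.weight": torch.zeros(3072, 768)}
fake_t5xxl = {"encoder.block.23.layer.1.DenseReluDense.wi_1.weight": torch.zeros(10240, 4096)}

print(detect_te_model(fake_clip_l))  # TEModel.CLIP_L
print(detect_te_model(fake_t5xxl))   # TEModel.T5_XXL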
def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
    clip_data = state_dicts

    class EmptyClass:
        pass
@@ -400,43 +459,61 @@ def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DI
    clip_target = EmptyClass()
    clip_target.params = {}
    if len(clip_data) == 1:
        te_model = detect_te_model(clip_data[0])
        if te_model == TEModel.CLIP_G:
            if clip_type == CLIPType.STABLE_CASCADE:
                clip_target.clip = sdxl_clip.StableCascadeClipModel
                clip_target.tokenizer = sdxl_clip.StableCascadeTokenizer
            elif clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            else:
                clip_target.clip = sdxl_clip.SDXLRefinerClipModel
                clip_target.tokenizer = sdxl_clip.SDXLTokenizer
        elif te_model == TEModel.CLIP_H:
            clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
            clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
        elif te_model == TEModel.T5_XXL:
            clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
        elif te_model == TEModel.T5_XL:
            clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
            clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
        elif te_model == TEModel.T5_BASE:
            clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
            clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
        else:
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            else:
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
        if clip_type == CLIPType.SD3:
            te_models = [detect_te_model(clip_data[0]), detect_te_model(clip_data[1])]
            clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=TEModel.CLIP_L in te_models, clip_g=TEModel.CLIP_G in te_models, t5=TEModel.T5_XXL in te_models, **t5xxl_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
        elif clip_type == CLIPType.HUNYUAN_DIT:
            clip_target.clip = comfy.text_encoders.hydit.HyditModel
            clip_target.tokenizer = comfy.text_encoders.hydit.HyditTokenizer
        elif clip_type == CLIPType.FLUX:
            clip_target.clip = comfy.text_encoders.flux.flux_clip(**t5xxl_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.flux.FluxTokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
    elif len(clip_data) == 3:
        clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(**t5xxl_detect(clip_data))
        clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer

    parameters = 0
    tokenizer_data = {}
    for c in clip_data:
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)

    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:
@@ -478,25 +555,39 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
    return (model, clip, vae)

def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    sd = comfy.utils.load_torch_file(ckpt_path)
    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
    if out is None:
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
    return out

def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    clip = None
    clipvision = None
    vae = None
    model = None
    model_patcher = None
    clip_target = None

    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
    parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()

    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
    if model_config is None:
        return None

    unet_weight_dtype = list(model_config.supported_inference_dtypes)
    if weight_dtype is not None and model_config.scaled_fp8 is None:
        unet_weight_dtype.append(weight_dtype)

    model_config.custom_operations = model_options.get("custom_operations", None)
    unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))

    if unet_dtype is None:
        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)

    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
@@ -506,7 +597,6 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
    if output_model:
        inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype)
        model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
        model.load_model_weights(sd, diffusion_model_prefix)
@@ -520,7 +610,8 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
        if clip_target is not None:
            clip_sd = model_config.process_clip_state_dict(sd)
            if len(clip_sd) > 0:
                parameters = comfy.utils.calculate_parameters(clip_sd)
                clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options)
                m, u = clip.load_sd(clip_sd, full_model=True)
                if len(m) > 0:
                    m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
@@ -539,15 +630,16 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
        logging.debug("left over keys: {}".format(left_over))

    if output_model:
        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
        if inital_load_device != torch.device("cpu"):
            logging.info("loaded straight to GPU")
            model_management.load_models_gpu([model_patcher], force_full_load=True)

    return (model_patcher, clip, vae, clipvision)
def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffusers or regular format
    dtype = model_options.get("dtype", None)

    #Allow loading unets from checkpoint files
    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
@@ -556,37 +648,49 @@ def load_unet_state_dict(sd): #load unet in diffusers or regular format
        sd = temp_sd

    parameters = comfy.utils.calculate_parameters(sd)
    weight_dtype = comfy.utils.weight_dtype(sd)

    load_device = model_management.get_torch_device()
    model_config = model_detection.model_config_from_unet(sd, "")

    if model_config is not None:
        new_sd = sd
    else:
        new_sd = model_detection.convert_diffusers_mmdit(sd, "")
        if new_sd is not None: #diffusers mmdit
            model_config = model_detection.model_config_from_unet(new_sd, "")
            if model_config is None:
                return None
        else: #diffusers unet
            model_config = model_detection.model_config_from_diffusers_unet(sd)
            if model_config is None:
                return None

            diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config)

            new_sd = {}
            for k in diffusers_keys:
                if k in sd:
                    new_sd[diffusers_keys[k]] = sd.pop(k)
                else:
                    logging.warning("{} {}".format(diffusers_keys[k], k))

    offload_device = model_management.unet_offload_device()
    unet_weight_dtype = list(model_config.supported_inference_dtypes)
    if weight_dtype is not None and model_config.scaled_fp8 is None:
        unet_weight_dtype.append(weight_dtype)

    if dtype is None:
        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)
    else:
        unet_dtype = dtype

    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
    model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
    if model_options.get("fp8_optimizations", False):
        model_config.optimizations["fp8"] = True

    model = model_config.get_model(new_sd, "")
    model = model.to(offload_device)
    model.load_model_weights(new_sd, "")
@@ -595,24 +699,36 @@ def load_unet_state_dict(sd): #load unet in diffusers or regular format
        logging.info("left over keys in unet: {}".format(left_over))
    return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)

def load_diffusion_model(unet_path, model_options={}):
    sd = comfy.utils.load_torch_file(unet_path)
    model = load_diffusion_model_state_dict(sd, model_options=model_options)
    if model is None:
        logging.error("ERROR UNSUPPORTED UNET {}".format(unet_path))
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
    return model

def load_unet(unet_path, dtype=None):
    print("WARNING: the load_unet function has been deprecated and will be removed please switch to: load_diffusion_model")
    return load_diffusion_model(unet_path, model_options={"dtype": dtype})

def load_unet_state_dict(sd, dtype=None):
    print("WARNING: the load_unet_state_dict function has been deprecated and will be removed please switch to: load_diffusion_model_state_dict")
    return load_diffusion_model_state_dict(sd, model_options={"dtype": dtype})
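With the renamed loaders above, per-model knobs travel through model_options instead of extra positional arguments. A hedged usage sketch (the file path is a placeholder; "dtype" and "fp8_optimizations" are the keys read by load_diffusion_model_state_dict above):

import torch
import comfy.sd

model_patcher = comfy.sd.load_diffusion_model(
    "models/unet/some_diffusion_model.safetensors",   # placeholder path
    model_options={
        "dtype": torch.float8_e4m3fn,     # force fp8 weight storage
        "fp8_optimizations": True,        # also allow the fast fp8 matrix-multiply path
    },
)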
def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, metadata=None, extra_keys={}):
    clip_sd = None
    load_models = [model]
    if clip is not None:
        load_models.append(clip.load_model())
        clip_sd = clip.get_sd()
    vae_sd = None
    if vae is not None:
        vae_sd = vae.get_sd()

    model_management.load_models_gpu(load_models, force_patch_weights=True)
    clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
    sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
    for k in extra_keys:
        sd[k] = extra_keys[k]


@@ -75,16 +75,15 @@ class ClipTokenWeightEncoder:
        return r

class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    """Uses the CLIP transformer encoder for text (from huggingface)"""
    LAYERS = [
        "last",
        "pooled",
        "hidden"
    ]
    def __init__(self, device="cpu", max_length=77,
                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
                 special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False,
                 return_projected_pooled=True, return_attention_masks=False, model_options={}):  # clip-vit-base-patch32
        super().__init__()
        assert layer in self.LAYERS
@@ -94,7 +93,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        with open(textmodel_json_config) as f:
            config = json.load(f)

        operations = model_options.get("custom_operations", None)
        scaled_fp8 = None

        if operations is None:
            scaled_fp8 = model_options.get("scaled_fp8", None)
            if scaled_fp8 is not None:
                operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
            else:
                operations = comfy.ops.manual_cast

        self.operations = operations
        self.transformer = model_class(config, dtype, device, self.operations)
        if scaled_fp8 is not None:
            self.transformer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8))

        self.num_layers = self.transformer.num_layers
        self.max_length = max_length
@@ -140,15 +153,13 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    def set_up_textual_embeddings(self, tokens, current_embeds):
        out_tokens = []
        next_new_token = token_dict_size = current_embeds.weight.shape[0]
        embedding_weights = []

        for x in tokens:
            tokens_temp = []
            for y in x:
                if isinstance(y, numbers.Integral):
                    tokens_temp += [int(y)]
                else:
                    if y.shape[0] == current_embeds.weight.shape[1]:
@@ -163,12 +174,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        n = token_dict_size
        if len(embedding_weights) > 0:
            new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
            new_embedding.weight[:token_dict_size] = current_embeds.weight
            for x in embedding_weights:
                new_embedding.weight[n] = x
                n += 1
            self.transformer.set_input_embeddings(new_embedding)

        processed_tokens = []
@@ -197,7 +207,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        if self.enable_attention_masks:
            attention_mask_model = attention_mask

        outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
        self.transformer.set_input_embeddings(backup_embeds)

        if self.layer == "last":
@@ -315,6 +325,17 @@ def expand_directory_list(directories):
            dirs.add(root)
    return list(dirs)

def bundled_embed(embed, prefix, suffix): #bundled embedding in lora format
    i = 0
    out_list = []
    for k in embed:
        if k.startswith(prefix) and k.endswith(suffix):
            out_list.append(embed[k])

    if len(out_list) == 0:
        return None

    return torch.cat(out_list, dim=0)
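bundled_embed above simply gathers every key matching the given prefix/suffix (embeddings bundled inside a LoRA-style file) and concatenates them along dim 0. An illustration with made-up keys, assuming the function above is in scope:

import torch

embed_file = {
    "bundle_emb.token_a.string_to_param.*": torch.randn(2, 768),
    "bundle_emb.token_b.string_to_param.*": torch.randn(1, 768),
    "lora_unet_block.lora_up.weight": torch.randn(4, 4),  # ignored: wrong prefix/suffix
}
print(bundled_embed(embed_file, 'bundle_emb.', '.string_to_param.*').shape)  # torch.Size([3, 768])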
def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=None):
    if isinstance(embedding_directory, str):
        embedding_directory = [embedding_directory]
@@ -381,12 +402,16 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
        elif embed_key is not None and embed_key in embed:
            embed_out = embed[embed_key]
        else:
            embed_out = bundled_embed(embed, 'bundle_emb.', '.string_to_param.*')
            if embed_out is None:
                embed_out = bundled_embed(embed, 'bundle_emb.', '.{}'.format(embed_key))
            if embed_out is None:
                values = embed.values()
                embed_out = next(iter(values))
    return embed_out

class SDTokenizer:
    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None, pad_token=None, tokenizer_data={}):
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
@@ -519,12 +544,15 @@ class SDTokenizer:
    def untokenize(self, token_weight_pair):
        return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair))

    def state_dict(self):
        return {}

class SD1Tokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer):
        self.clip_name = clip_name
        self.clip = "clip_{}".format(self.clip_name)
        tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))

    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
@@ -534,9 +562,15 @@ class SD1Tokenizer:
    def untokenize(self, token_weight_pair):
        return getattr(self, self.clip).untokenize(token_weight_pair)

    def state_dict(self):
        return {}

class SD1CheckpointClipModel(SDClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options)

class SD1ClipModel(torch.nn.Module):
    def __init__(self, device="cpu", dtype=None, model_options={}, clip_name="l", clip_model=SD1CheckpointClipModel, name=None, **kwargs):
        super().__init__()

        if name is not None:
@@ -546,7 +580,8 @@ class SD1ClipModel(torch.nn.Module):
        self.clip_name = clip_name
        self.clip = "clip_{}".format(self.clip_name)

        clip_model = model_options.get("{}_class".format(self.clip), clip_model)
        setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs))

        self.dtypes = set()
        if dtype is not None:


@@ -6,7 +6,7 @@
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 49407,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "initializer_factor": 1.0,


@@ -3,26 +3,27 @@ import torch
import os

class SDXLClipG(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
        if layer == "penultimate":
            layer="hidden"
            layer_idx=-2

        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
                         special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options)

    def load_sd(self, sd):
        return super().load_sd(sd)

class SDXLClipGTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')

class SDXLTokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)

    def tokenize_with_weights(self, text:str, return_word_ids=False):
@@ -34,11 +35,15 @@ class SDXLTokenizer:
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):
return self.clip_g.untokenize(token_weight_pair) return self.clip_g.untokenize(token_weight_pair)
def state_dict(self):
return {}
class SDXLClipModel(torch.nn.Module): class SDXLClipModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None): def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__() super().__init__()
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False) clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
self.clip_g = SDXLClipG(device=device, dtype=dtype) self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options)
self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.dtypes = set([dtype]) self.dtypes = set([dtype])
def set_clip_options(self, options): def set_clip_options(self, options):
@@ -54,7 +59,8 @@ class SDXLClipModel(torch.nn.Module):
token_weight_pairs_l = token_weight_pairs["l"] token_weight_pairs_l = token_weight_pairs["l"]
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g) g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l) l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
return torch.cat([l_out, g_out], dim=-1), g_pooled cut_to = min(l_out.shape[1], g_out.shape[1])
return torch.cat([l_out[:,:cut_to], g_out[:,:cut_to]], dim=-1), g_pooled
def load_sd(self, sd): def load_sd(self, sd):
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -63,27 +69,27 @@ class SDXLClipModel(torch.nn.Module):
return self.clip_l.load_sd(sd) return self.clip_l.load_sd(sd)
class SDXLRefinerClipModel(sd1_clip.SD1ClipModel): class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None): def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG) super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG, model_options=model_options)
class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer): class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
def __init__(self, tokenizer_path=None, embedding_directory=None): def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g') super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')
class StableCascadeTokenizer(sd1_clip.SD1Tokenizer): class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None): def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, clip_name="g", tokenizer=StableCascadeClipGTokenizer) super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="g", tokenizer=StableCascadeClipGTokenizer)
class StableCascadeClipG(sd1_clip.SDClipModel): class StableCascadeClipG(sd1_clip.SDClipModel):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None): def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True) special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options)
def load_sd(self, sd): def load_sd(self, sd):
return super().load_sd(sd) return super().load_sd(sd)
class StableCascadeClipModel(sd1_clip.SD1ClipModel): class StableCascadeClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None): def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG) super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG, model_options=model_options)
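
The cut_to change in SDXLClipModel.encode_token_weights guards the channel-wise concat when clip_l and clip_g produce different sequence lengths (e.g. a long-context clip_l paired with a standard clip_g). A self-contained PyTorch sketch of the same operation:

import torch

l_out = torch.randn(1, 248, 768)   # e.g. a long clip_l output
g_out = torch.randn(1, 77, 1280)   # standard clip_g output
cut_to = min(l_out.shape[1], g_out.shape[1])
lg = torch.cat([l_out[:, :cut_to], g_out[:, :cut_to]], dim=-1)
print(lg.shape)  # torch.Size([1, 77, 2048])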


@@ -3,11 +3,13 @@ from . import model_base
from . import utils
from . import sd1_clip
- from . import sd2_clip
from . import sdxl_clip
- from . import sd3_clip
- from . import sa_t5
+ import comfy.text_encoders.sd2_clip
+ import comfy.text_encoders.sd3_clip
+ import comfy.text_encoders.sa_t5
import comfy.text_encoders.aura_t5
+ import comfy.text_encoders.hydit
+ import comfy.text_encoders.flux
from . import supported_models_base
from . import latent_formats
@@ -29,6 +31,7 @@ class SD15(supported_models_base.BASE):
}
latent_format = latent_formats.SD15
+ memory_usage_factor = 1.0
def process_clip_state_dict(self, state_dict):
k = list(state_dict.keys())
@@ -75,6 +78,7 @@ class SD20(supported_models_base.BASE):
}
latent_format = latent_formats.SD15
+ memory_usage_factor = 1.0
def model_type(self, state_dict, prefix=""):
if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
@@ -100,7 +104,7 @@ class SD20(supported_models_base.BASE):
return state_dict
def clip_target(self, state_dict={}):
- return supported_models_base.ClipTarget(sd2_clip.SD2Tokenizer, sd2_clip.SD2ClipModel)
+ return supported_models_base.ClipTarget(comfy.text_encoders.sd2_clip.SD2Tokenizer, comfy.text_encoders.sd2_clip.SD2ClipModel)
class SD21UnclipL(SD20):
unet_config = {
@@ -138,6 +142,7 @@ class SDXLRefiner(supported_models_base.BASE):
}
latent_format = latent_formats.SDXL
+ memory_usage_factor = 1.0
def get_model(self, state_dict, prefix="", device=None):
return model_base.SDXLRefiner(self, device=device)
@@ -176,6 +181,8 @@ class SDXL(supported_models_base.BASE):
latent_format = latent_formats.SDXL
+ memory_usage_factor = 0.8
def model_type(self, state_dict, prefix=""):
if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5
self.latent_format = latent_formats.SDXL_Playground_2_5()
@@ -503,6 +510,9 @@ class SD3(supported_models_base.BASE):
unet_extra_config = {}
latent_format = latent_formats.SD3
+ memory_usage_factor = 1.2
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
@@ -519,12 +529,11 @@ class SD3(supported_models_base.BASE):
clip_l = True
if "{}clip_g.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
clip_g = True
- t5_key = "{}t5xxl.transformer.encoder.final_layer_norm.weight".format(pref)
- if t5_key in state_dict:
+ t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+ if "dtype_t5" in t5_detect:
t5 = True
- dtype_t5 = state_dict[t5_key].dtype
- return supported_models_base.ClipTarget(sd3_clip.SD3Tokenizer, sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5))
+ return supported_models_base.ClipTarget(comfy.text_encoders.sd3_clip.SD3Tokenizer, comfy.text_encoders.sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, **t5_detect))
class StableAudio(supported_models_base.BASE):
unet_config = {
@@ -555,7 +564,7 @@ class StableAudio(supported_models_base.BASE):
return utils.state_dict_prefix_replace(state_dict, replace_prefix)
def clip_target(self, state_dict={}):
- return supported_models_base.ClipTarget(sa_t5.SAT5Tokenizer, sa_t5.SAT5Model)
+ return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model)
class AuraFlow(supported_models_base.BASE):
unet_config = {
@@ -580,6 +589,88 @@ class AuraFlow(supported_models_base.BASE):
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.aura_t5.AuraT5Tokenizer, comfy.text_encoders.aura_t5.AuraT5Model)
- models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow]
class HunyuanDiT(supported_models_base.BASE):
unet_config = {
"image_model": "hydit",
}
unet_extra_config = {
"attn_precision": torch.float32,
}
sampling_settings = {
"linear_start": 0.00085,
"linear_end": 0.018,
}
latent_format = latent_formats.SDXL
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanDiT(self, device=device)
return out
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.hydit.HyditTokenizer, comfy.text_encoders.hydit.HyditModel)
class HunyuanDiT1(HunyuanDiT):
unet_config = {
"image_model": "hydit1",
}
unet_extra_config = {}
sampling_settings = {
"linear_start" : 0.00085,
"linear_end" : 0.03,
}
class Flux(supported_models_base.BASE):
unet_config = {
"image_model": "flux",
"guidance_embed": True,
}
sampling_settings = {
}
unet_extra_config = {}
latent_format = latent_formats.Flux
memory_usage_factor = 2.8
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Flux(self, device=device)
return out
def clip_target(self, state_dict={}):
pref = self.text_encoder_key_prefix[0]
t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
class FluxSchnell(Flux):
unet_config = {
"image_model": "flux",
"guidance_embed": False,
}
sampling_settings = {
"multiplier": 1.0,
"shift": 1.0,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
return out
models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, HunyuanDiT, HunyuanDiT1, Flux, FluxSchnell]
models += [SVD_img2vid]
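
An illustrative sketch (not the real matching code) of how the unet_config entries above separate the two new Flux entries: both declare image_model "flux", and guidance_embed decides whether the Flux or FluxSchnell config applies:

def pick_flux_class(unet_config):
    # placeholder helper for illustration only; the repo's matching goes through BASE.matches()
    if unet_config.get("image_model") != "flux":
        return None
    return "Flux" if unet_config.get("guidance_embed", False) else "FluxSchnell"

print(pick_flux_class({"image_model": "flux", "guidance_embed": True}))   # Flux
print(pick_flux_class({"image_model": "flux", "guidance_embed": False}))  # FluxSchnell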


@@ -1,3 +1,21 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
from . import model_base
from . import utils
@@ -27,7 +45,12 @@ class BASE:
text_encoder_key_prefix = ["cond_stage_model."]
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+ memory_usage_factor = 2.0
manual_cast_dtype = None
+ custom_operations = None
+ scaled_fp8 = None
+ optimizations = {"fp8": False}
@classmethod
def matches(s, unet_config, state_dict=None):
@@ -50,6 +73,7 @@ class BASE:
self.unet_config = unet_config.copy()
self.sampling_settings = self.sampling_settings.copy()
self.latent_format = self.latent_format()
+ self.optimizations = self.optimizations.copy()
for x in self.unet_extra_config:
self.unet_config[x] = self.unet_extra_config[x]
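
The new self.optimizations = self.optimizations.copy() line matters because optimizations is a class-level dict; without the copy, every model config instance would share and mutate the same object. A small self-contained demonstration of the difference:

class SharedDefaults:
    optimizations = {"fp8": False}          # class-level dict, shared by every instance

class CopiedDefaults:
    optimizations = {"fp8": False}
    def __init__(self):
        self.optimizations = self.optimizations.copy()   # same idea as the new line in BASE.__init__

a, b = SharedDefaults(), SharedDefaults()
a.optimizations["fp8"] = True
print(b.optimizations["fp8"])   # True -- the mutation leaks into every other instance

c, d = CopiedDefaults(), CopiedDefaults()
c.optimizations["fp8"] = True
print(d.optimizations["fp8"])   # False -- each instance keeps its own settings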


@@ -1,22 +1,22 @@
from comfy import sd1_clip
- from .llama_tokenizer import LLAMATokenizer
- import comfy.t5
+ from .spiece_tokenizer import SPieceTokenizer
+ import comfy.text_encoders.t5
import os
class PT5XlModel(sd1_clip.SDClipModel):
- def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
+ def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_config_xl.json")
- super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.t5.T5, enable_attention_masks=True, zero_out_masked=True)
+ super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True, model_options=model_options)
class PT5XlTokenizer(sd1_clip.SDTokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model")
- super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=LLAMATokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
+ super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
- super().__init__(embedding_directory=embedding_directory, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)
class AuraT5Model(sd1_clip.SD1ClipModel):
- def __init__(self, device="cpu", dtype=None, **kwargs):
+ def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
- super().__init__(device=device, dtype=dtype, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)
+ super().__init__(device=device, dtype=dtype, model_options=model_options, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)

comfy/text_encoders/bert.py (new file, 140 lines)

@@ -0,0 +1,140 @@
import torch
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
class BertAttention(torch.nn.Module):
def __init__(self, embed_dim, heads, dtype, device, operations):
super().__init__()
self.heads = heads
self.query = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
self.key = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
self.value = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
def forward(self, x, mask=None, optimized_attention=None):
q = self.query(x)
k = self.key(x)
v = self.value(x)
out = optimized_attention(q, k, v, self.heads, mask)
return out
class BertOutput(torch.nn.Module):
def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
super().__init__()
self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
self.LayerNorm = operations.LayerNorm(output_dim, eps=layer_norm_eps, dtype=dtype, device=device)
# self.dropout = nn.Dropout(0.0)
def forward(self, x, y):
x = self.dense(x)
# hidden_states = self.dropout(hidden_states)
x = self.LayerNorm(x + y)
return x
class BertAttentionBlock(torch.nn.Module):
def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.self = BertAttention(embed_dim, heads, dtype, device, operations)
self.output = BertOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
def forward(self, x, mask, optimized_attention):
y = self.self(x, mask, optimized_attention)
return self.output(y, x)
class BertIntermediate(torch.nn.Module):
def __init__(self, embed_dim, intermediate_dim, dtype, device, operations):
super().__init__()
self.dense = operations.Linear(embed_dim, intermediate_dim, dtype=dtype, device=device)
def forward(self, x):
x = self.dense(x)
return torch.nn.functional.gelu(x)
class BertBlock(torch.nn.Module):
def __init__(self, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.attention = BertAttentionBlock(embed_dim, heads, layer_norm_eps, dtype, device, operations)
self.intermediate = BertIntermediate(embed_dim, intermediate_dim, dtype, device, operations)
self.output = BertOutput(intermediate_dim, embed_dim, layer_norm_eps, dtype, device, operations)
def forward(self, x, mask, optimized_attention):
x = self.attention(x, mask, optimized_attention)
y = self.intermediate(x)
return self.output(y, x)
class BertEncoder(torch.nn.Module):
def __init__(self, num_layers, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.layer = torch.nn.ModuleList([BertBlock(embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations) for i in range(num_layers)])
def forward(self, x, mask=None, intermediate_output=None):
optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)
if intermediate_output is not None:
if intermediate_output < 0:
intermediate_output = len(self.layer) + intermediate_output
intermediate = None
for i, l in enumerate(self.layer):
x = l(x, mask, optimized_attention)
if i == intermediate_output:
intermediate = x.clone()
return x, intermediate
class BertEmbeddings(torch.nn.Module):
def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, pad_token_id, embed_dim, layer_norm_eps, dtype, device, operations):
super().__init__()
self.word_embeddings = operations.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id, dtype=dtype, device=device)
self.position_embeddings = operations.Embedding(max_position_embeddings, embed_dim, dtype=dtype, device=device)
self.token_type_embeddings = operations.Embedding(type_vocab_size, embed_dim, dtype=dtype, device=device)
self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, input_tokens, token_type_ids=None, dtype=None):
x = self.word_embeddings(input_tokens, out_dtype=dtype)
x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
if token_type_ids is not None:
x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
else:
x += comfy.ops.cast_to_input(self.token_type_embeddings.weight[0], x)
x = self.LayerNorm(x)
return x
class BertModel_(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
embed_dim = config_dict["hidden_size"]
layer_norm_eps = config_dict["layer_norm_eps"]
self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embeddings(input_tokens, dtype=dtype)
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
x, i = self.encoder(x, mask, intermediate_output)
return x, i
class BertModel(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
self.bert = BertModel_(config_dict, dtype, device, operations)
self.num_layers = config_dict["num_hidden_layers"]
def get_input_embeddings(self):
return self.bert.embeddings.word_embeddings
def set_input_embeddings(self, embeddings):
self.bert.embeddings.word_embeddings = embeddings
def forward(self, *args, **kwargs):
return self.bert(*args, **kwargs)
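
BertModel_.forward turns a (batch, seq) padding mask into an additive attention bias with -inf at padded key positions before handing it to the encoder. A self-contained PyTorch sketch of that mask construction:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])   # 1 = real token, 0 = padding
x_dtype = torch.float32
mask = 1.0 - attention_mask.to(x_dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
print(mask.shape)     # torch.Size([1, 1, 5, 5])
print(mask[0, 0, 0])  # tensor([0., 0., 0., -inf, -inf])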


@@ -0,0 +1,72 @@
from comfy import sd1_clip
import comfy.text_encoders.t5
import comfy.text_encoders.sd3_clip
import comfy.model_management
from transformers import T5TokenizerFast
import torch
import os
class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
class FluxTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
return out
def untokenize(self, token_weight_pair):
return self.clip_l.untokenize(token_weight_pair)
def state_dict(self):
return {}
class FluxClipModel(torch.nn.Module):
def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
super().__init__()
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
self.dtypes = set([dtype, dtype_t5])
def set_clip_options(self, options):
self.clip_l.set_clip_options(options)
self.t5xxl.set_clip_options(options)
def reset_clip_options(self):
self.clip_l.reset_clip_options()
self.t5xxl.reset_clip_options()
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_l = token_weight_pairs["l"]
token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
return t5_out, l_pooled
def load_sd(self, sd):
if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
return self.clip_l.load_sd(sd)
else:
return self.t5xxl.load_sd(sd)
def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
class FluxClipModel_(FluxClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
model_options = model_options.copy()
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
return FluxClipModel_
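
flux_clip() is a small factory: values detected from the checkpoint (the T5 dtype and an optional scaled-fp8 marker) are baked into a subclass that the loader can later instantiate with only device/dtype/model_options. A self-contained sketch of the same closure pattern; the class and function names below are placeholders, not ComfyUI APIs:

import torch

class TextEncoderStack:
    def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
        self.dtype_t5 = dtype_t5
        self.model_options = model_options

def make_clip_class(dtype_t5=None, t5xxl_scaled_fp8=None):
    class TextEncoderStack_(TextEncoderStack):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            # mirror of the option-injection in flux_clip()/sd3_clip()
            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
            super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
    return TextEncoderStack_

ClipClass = make_clip_class(dtype_t5=torch.float16)
print(ClipClass().dtype_t5)  # torch.float16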


@@ -0,0 +1,79 @@
from comfy import sd1_clip
from transformers import BertTokenizer
from .spiece_tokenizer import SPieceTokenizer
from .bert import BertModel
import comfy.text_encoders.t5
import os
import torch
class HyditBertModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class HyditBertTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77)
class MT5XLModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_config_xl.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class MT5XLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
#tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_tokenizer"), "spiece.model")
tokenizer = tokenizer_data.get("spiece_model", None)
super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()}
class HyditTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None)
self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)
return out
def untokenize(self, token_weight_pair):
return self.hydit_clip.untokenize(token_weight_pair)
def state_dict(self):
return {"mt5xl.spiece_model": self.mt5xl.state_dict()["spiece_model"]}
class HyditModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__()
self.hydit_clip = HyditBertModel(dtype=dtype, model_options=model_options)
self.mt5xl = MT5XLModel(dtype=dtype, model_options=model_options)
self.dtypes = set()
if dtype is not None:
self.dtypes.add(dtype)
def encode_token_weights(self, token_weight_pairs):
hydit_out = self.hydit_clip.encode_token_weights(token_weight_pairs["hydit_clip"])
mt5_out = self.mt5xl.encode_token_weights(token_weight_pairs["mt5xl"])
return hydit_out[0], hydit_out[1], {"attention_mask": hydit_out[2]["attention_mask"], "conditioning_mt5xl": mt5_out[0], "attention_mask_mt5xl": mt5_out[2]["attention_mask"]}
def load_sd(self, sd):
if "bert.encoder.layer.0.attention.self.query.weight" in sd:
return self.hydit_clip.load_sd(sd)
else:
return self.mt5xl.load_sd(sd)
def set_clip_options(self, options):
self.hydit_clip.set_clip_options(options)
self.mt5xl.set_clip_options(options)
def reset_clip_options(self):
self.hydit_clip.reset_clip_options()
self.mt5xl.reset_clip_options()
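
HyditTokenizer keeps the mt5 sentencepiece model inside the tokenizer state dict under the key "mt5xl.spiece_model" and hands it back to MT5XLTokenizer as a local "spiece_model" entry when the tokenizer is rebuilt. A minimal sketch of that round trip, with placeholder bytes standing in for the serialized proto:

serialized_proto = b"\x0a\x00"   # placeholder for SPieceTokenizer.serialize_model() output

combined = {"mt5xl.spiece_model": serialized_proto}                               # HyditTokenizer.state_dict()
mt5_tokenizer_data = {"spiece_model": combined.get("mt5xl.spiece_model", None)}   # what __init__ rebuilds
print(mt5_tokenizer_data["spiece_model"] == serialized_proto)                     # True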


@@ -0,0 +1,35 @@
{
"_name_or_path": "hfl/chinese-roberta-wwm-ext-large",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.22.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 47020
}


@@ -0,0 +1,7 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}


@@ -0,0 +1,16 @@
{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"name_or_path": "hfl/chinese-roberta-wwm-ext",
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/home/chenweifeng/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext/snapshots/5c58d0b8ec1d9014354d691c538661bf00bfdb44/special_tokens_map.json",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]",
"model_max_length": 77
}

File diff suppressed because it is too large.


@@ -1,22 +0,0 @@
import os
class LLAMATokenizer:
@staticmethod
def from_pretrained(path):
return LLAMATokenizer(path)
def __init__(self, tokenizer_path):
import sentencepiece
self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path)
self.end = self.tokenizer.eos_id()
def get_vocab(self):
out = {}
for i in range(self.tokenizer.get_piece_size()):
out[self.tokenizer.id_to_piece(i)] = i
return out
def __call__(self, string):
out = self.tokenizer.encode(string)
out += [self.end]
return {"input_ids": out}


@@ -0,0 +1,25 @@
{
"_name_or_path": "openai/clip-vit-large-patch14",
"architectures": [
"CLIPTextModel"
],
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 248,
"model_type": "clip_text_model",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"projection_dim": 768,
"torch_dtype": "float32",
"transformers_version": "4.24.0",
"vocab_size": 49408
}


@@ -0,0 +1,30 @@
from comfy import sd1_clip
import os
class LongClipTokenizer_(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
class LongClipModel_(sd1_clip.SDClipModel):
def __init__(self, *args, **kwargs):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "long_clipl.json")
super().__init__(*args, textmodel_json_config=textmodel_json_config, **kwargs)
class LongClipTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)
class LongClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)
def model_options_long_clip(sd, tokenizer_data, model_options):
w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
if w is None:
w = sd.get("text_model.embeddings.position_embedding.weight", None)
if w is not None and w.shape[0] == 248:
tokenizer_data = tokenizer_data.copy()
model_options = model_options.copy()
tokenizer_data["clip_l_tokenizer_class"] = LongClipTokenizer_
model_options["clip_l_class"] = LongClipModel_
return tokenizer_data, model_options
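
model_options_long_clip() keys off the size of the text model's position embedding: 248 rows means a LongCLIP checkpoint, so the long tokenizer/encoder classes are injected through tokenizer_data and model_options. A self-contained sketch of just the detection step:

import torch

# fake state dict with a 248-position embedding, as a LongCLIP clip_l checkpoint would have
sd = {"clip_l.text_model.embeddings.position_embedding.weight": torch.zeros(248, 768)}

w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
if w is None:
    w = sd.get("text_model.embeddings.position_embedding.weight", None)
is_long_clip = w is not None and w.shape[0] == 248
print(is_long_clip)  # True -> swap in LongClipTokenizer_/LongClipModel_ via the override dicts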


@@ -0,0 +1,22 @@
{
"d_ff": 5120,
"d_kv": 64,
"d_model": 2048,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"dense_act_fn": "gelu_pytorch_tanh",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "mt5",
"num_decoder_layers": 24,
"num_heads": 32,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 250112
}


@@ -1,22 +1,22 @@
from comfy import sd1_clip
from transformers import T5TokenizerFast
- import comfy.t5
+ import comfy.text_encoders.t5
import os
class T5BaseModel(sd1_clip.SDClipModel):
- def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
+ def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_base.json")
- super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.t5.T5, enable_attention_masks=True, zero_out_masked=True)
+ super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)
class T5BaseTokenizer(sd1_clip.SDTokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128)
class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
- super().__init__(embedding_directory=embedding_directory, clip_name="t5base", tokenizer=T5BaseTokenizer)
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5base", tokenizer=T5BaseTokenizer)
class SAT5Model(sd1_clip.SD1ClipModel):
- def __init__(self, device="cpu", dtype=None, **kwargs):
+ def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
- super().__init__(device=device, dtype=dtype, name="t5base", clip_model=T5BaseModel, **kwargs)
+ super().__init__(device=device, dtype=dtype, model_options=model_options, name="t5base", clip_model=T5BaseModel, **kwargs)


@@ -2,22 +2,22 @@ from comfy import sd1_clip
import os
class SD2ClipHModel(sd1_clip.SDClipModel):
- def __init__(self, arch="ViT-H-14", device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):
+ def __init__(self, arch="ViT-H-14", device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
if layer == "penultimate":
layer="hidden"
layer_idx=-2
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd2_clip_config.json")
- super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0})
+ super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, return_projected_pooled=True, model_options=model_options)
class SD2ClipHTokenizer(sd1_clip.SDTokenizer):
- def __init__(self, tokenizer_path=None, embedding_directory=None):
+ def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024)
class SD2Tokenizer(sd1_clip.SD1Tokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
- super().__init__(embedding_directory=embedding_directory, clip_name="h", tokenizer=SD2ClipHTokenizer)
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="h", tokenizer=SD2ClipHTokenizer)
class SD2ClipModel(sd1_clip.SD1ClipModel):
- def __init__(self, device="cpu", dtype=None, **kwargs):
+ def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
- super().__init__(device=device, dtype=dtype, clip_name="h", clip_model=SD2ClipHModel, **kwargs)
+ super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="h", clip_model=SD2ClipHModel, **kwargs)


@@ -5,7 +5,7 @@
"attention_dropout": 0.0, "attention_dropout": 0.0,
"bos_token_id": 0, "bos_token_id": 0,
"dropout": 0.0, "dropout": 0.0,
"eos_token_id": 2, "eos_token_id": 49407,
"hidden_act": "gelu", "hidden_act": "gelu",
"hidden_size": 1024, "hidden_size": 1024,
"initializer_factor": 1.0, "initializer_factor": 1.0,


@@ -1,35 +1,45 @@
from comfy import sd1_clip
from comfy import sdxl_clip
from transformers import T5TokenizerFast
- import comfy.t5
+ import comfy.text_encoders.t5
import torch
import os
import comfy.model_management
import logging
class T5XXLModel(sd1_clip.SDClipModel):
- def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
+ def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json")
- super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.t5.T5)
+ t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
+ if t5xxl_scaled_fp8 is not None:
+ model_options = model_options.copy()
+ model_options["scaled_fp8"] = t5xxl_scaled_fp8
+ super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+ def t5_xxl_detect(state_dict, prefix=""):
+ out = {}
+ t5_key = "{}encoder.final_layer_norm.weight".format(prefix)
+ if t5_key in state_dict:
+ out["dtype_t5"] = state_dict[t5_key].dtype
+ scaled_fp8_key = "{}scaled_fp8".format(prefix)
+ if scaled_fp8_key in state_dict:
+ out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
+ return out
class T5XXLTokenizer(sd1_clip.SDTokenizer):
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
- super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
+ super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
- class SDT5XXLTokenizer(sd1_clip.SD1Tokenizer):
- def __init__(self, embedding_directory=None):
- super().__init__(embedding_directory=embedding_directory, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
- class SDT5XXLModel(sd1_clip.SD1ClipModel):
- def __init__(self, device="cpu", dtype=None, **kwargs):
- super().__init__(device=device, dtype=dtype, clip_name="t5xxl", clip_model=T5XXLModel, **kwargs)
class SD3Tokenizer:
- def __init__(self, embedding_directory=None):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
- self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory)
+ clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
+ self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
@@ -43,32 +53,30 @@ class SD3Tokenizer:
def untokenize(self, token_weight_pair):
return self.clip_g.untokenize(token_weight_pair)
+ def state_dict(self):
+ return {}
class SD3ClipModel(torch.nn.Module):
- def __init__(self, clip_l=True, clip_g=True, t5=True, dtype_t5=None, device="cpu", dtype=None):
+ def __init__(self, clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5_attention_mask=False, device="cpu", dtype=None, model_options={}):
super().__init__()
self.dtypes = set()
if clip_l:
- self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False)
+ clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
+ self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False, model_options=model_options)
self.dtypes.add(dtype)
else:
self.clip_l = None
if clip_g:
- self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype)
+ self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.dtypes.add(dtype)
else:
self.clip_g = None
if t5:
- if dtype_t5 is None:
- dtype_t5 = dtype
- elif comfy.model_management.dtype_size(dtype_t5) > comfy.model_management.dtype_size(dtype):
- dtype_t5 = dtype
- if not comfy.model_management.supports_cast(device, dtype_t5):
- dtype_t5 = dtype
- self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5)
+ dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
+ self.t5_attention_mask = t5_attention_mask
+ self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options, attention_mask=self.t5_attention_mask)
self.dtypes.add(dtype_t5)
else:
self.t5xxl = None
@@ -94,10 +102,11 @@ class SD3ClipModel(torch.nn.Module):
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_l = token_weight_pairs["l"]
token_weight_pairs_g = token_weight_pairs["g"]
- token_weight_pars_t5 = token_weight_pairs["t5xxl"]
+ token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
lg_out = None
pooled = None
out = None
+ extra = {}
if len(token_weight_pairs_g) > 0 or len(token_weight_pairs_l) > 0:
if self.clip_l is not None:
@@ -108,7 +117,8 @@ class SD3ClipModel(torch.nn.Module):
if self.clip_g is not None:
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
if lg_out is not None:
- lg_out = torch.cat([lg_out, g_out], dim=-1)
+ cut_to = min(lg_out.shape[1], g_out.shape[1])
+ lg_out = torch.cat([lg_out[:,:cut_to], g_out[:,:cut_to]], dim=-1)
else:
lg_out = torch.nn.functional.pad(g_out, (768, 0))
else:
@@ -121,7 +131,11 @@ class SD3ClipModel(torch.nn.Module):
pooled = torch.cat((l_pooled, g_pooled), dim=-1)
if self.t5xxl is not None:
- t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pars_t5)
+ t5_output = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
+ t5_out, t5_pooled = t5_output[:2]
+ if self.t5_attention_mask:
+ extra["attention_mask"] = t5_output[2]["attention_mask"]
if lg_out is not None:
out = torch.cat([lg_out, t5_out], dim=-2)
else:
@@ -133,7 +147,7 @@ class SD3ClipModel(torch.nn.Module):
if pooled is None:
pooled = torch.zeros((1, 768 + 1280), device=comfy.model_management.intermediate_device())
- return out, pooled
+ return out, pooled, extra
def load_sd(self, sd):
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -143,8 +157,11 @@ class SD3ClipModel(torch.nn.Module):
else:
return self.t5xxl.load_sd(sd)
- def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None):
+ def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5xxl_scaled_fp8=None, t5_attention_mask=False):
class SD3ClipModel_(SD3ClipModel):
- def __init__(self, device="cpu", dtype=None):
+ def __init__(self, device="cpu", dtype=None, model_options={}):
- super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, device=device, dtype=dtype)
+ if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
+ model_options = model_options.copy()
+ model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
+ super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, t5_attention_mask=t5_attention_mask, device=device, dtype=dtype, model_options=model_options)
return SD3ClipModel_
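
t5_xxl_detect() reports the T5-XXL weight dtype and, when the checkpoint carries a scaled_fp8 marker tensor, its dtype as well; those keys feed straight into sd3_clip()/flux_clip() as keyword arguments. A self-contained sketch of what it returns for a scaled-fp8 T5 state dict (assumes a PyTorch build with float8 dtypes):

import torch

state_dict = {
    "text_encoders.t5xxl.transformer.encoder.final_layer_norm.weight": torch.zeros(4096, dtype=torch.float8_e4m3fn),
    "text_encoders.t5xxl.transformer.scaled_fp8": torch.zeros(1, dtype=torch.float8_e4m3fn),
}

prefix = "text_encoders.t5xxl.transformer."
out = {}
t5_key = "{}encoder.final_layer_norm.weight".format(prefix)
if t5_key in state_dict:
    out["dtype_t5"] = state_dict[t5_key].dtype
scaled_fp8_key = "{}scaled_fp8".format(prefix)
if scaled_fp8_key in state_dict:
    out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
print(out)  # {'dtype_t5': torch.float8_e4m3fn, 't5xxl_scaled_fp8': torch.float8_e4m3fn}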


@@ -0,0 +1,32 @@
import os
import torch
class SPieceTokenizer:
add_eos = True
@staticmethod
def from_pretrained(path):
return SPieceTokenizer(path)
def __init__(self, tokenizer_path):
import sentencepiece
if torch.is_tensor(tokenizer_path):
tokenizer_path = tokenizer_path.numpy().tobytes()
if isinstance(tokenizer_path, bytes):
self.tokenizer = sentencepiece.SentencePieceProcessor(model_proto=tokenizer_path, add_eos=self.add_eos)
else:
self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path, add_eos=self.add_eos)
def get_vocab(self):
out = {}
for i in range(self.tokenizer.get_piece_size()):
out[self.tokenizer.id_to_piece(i)] = i
return out
def __call__(self, string):
out = self.tokenizer.encode(string)
return {"input_ids": out}
def serialize_model(self):
return torch.ByteTensor(list(self.tokenizer.serialized_model_proto()))
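
A hedged usage sketch of SPieceTokenizer: it needs the sentencepiece package and a real .model file ("t5_tokenizer/spiece.model" below is a placeholder path, not one taken from this diff). serialize_model() returns the model proto as a ByteTensor, and the constructor accepts that tensor (or raw bytes) back directly:

# placeholder path for illustration only
tok = SPieceTokenizer.from_pretrained("t5_tokenizer/spiece.model")
ids = tok("a photo of a cat")["input_ids"]   # encode; eos is appended because add_eos=True

blob = tok.serialize_model()                  # torch.ByteTensor holding the model proto
restored = SPieceTokenizer(blob)              # tensors are converted to bytes and loaded via model_proto
assert restored("a photo of a cat")["input_ids"] == ids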


@@ -1,6 +1,7 @@
import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
+ import comfy.ops
class T5LayerNorm(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
@@ -11,7 +12,7 @@ class T5LayerNorm(torch.nn.Module):
def forward(self, x):
variance = x.pow(2).mean(-1, keepdim=True)
x = x * torch.rsqrt(variance + self.variance_epsilon)
- return self.weight.to(device=x.device, dtype=x.dtype) * x
+ return comfy.ops.cast_to_input(self.weight, x) * x
activations = {
"gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
@@ -82,7 +83,7 @@ class T5Attention(torch.nn.Module):
if relative_attention_bias:
self.relative_attention_num_buckets = 32
self.relative_attention_max_distance = 128
- self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)
+ self.relative_attention_bias = operations.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device, dtype=dtype)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
@@ -132,7 +133,7 @@ class T5Attention(torch.nn.Module):
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
- def compute_bias(self, query_length, key_length, device):
+ def compute_bias(self, query_length, key_length, device, dtype):
"""Compute binned relative position bias"""
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
@@ -143,7 +144,7 @@ class T5Attention(torch.nn.Module):
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
- values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
+ values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype) # shape (query_length, key_length, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
return values
@@ -152,7 +153,7 @@ class T5Attention(torch.nn.Module):
k = self.k(x)
v = self.v(x)
if self.relative_attention_bias is not None:
- past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)
+ past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device, x.dtype)
if past_bias is not None:
if mask is not None:
@@ -199,7 +200,7 @@ class T5Stack(torch.nn.Module):
self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
# self.dropout = nn.Dropout(config.dropout_rate)
- def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
+ def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -223,9 +224,9 @@ class T5(torch.nn.Module):
self.num_layers = config_dict["num_layers"]
model_dim = config_dict["d_model"]
- self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] == "t5", dtype, device, operations)
+ self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] != "umt5", dtype, device, operations)
self.dtype = dtype
- self.shared = torch.nn.Embedding(config_dict["vocab_size"], model_dim, device=device)
+ self.shared = operations.Embedding(config_dict["vocab_size"], model_dim, device=device, dtype=dtype)
def get_input_embeddings(self):
return self.shared
@@ -234,5 +235,7 @@ class T5(torch.nn.Module):
self.shared = embeddings
def forward(self, input_ids, *args, **kwargs):
- x = self.shared(input_ids)
+ x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
+ if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
+ x = torch.nan_to_num(x) #Fix for fp8 T5 base
return self.encoder(x, *args, **kwargs)
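
Two related changes above: the shared embedding is now an operations.Embedding that can emit activations in a requested out_dtype, and when the stored weights are an fp8 dtype the embeddings are passed through nan_to_num. A self-contained analogue using a standard nn.Embedding (float16 stands in for the stored dtype here; the nan_to_num branch is the one an fp8-stored model would take):

import torch

storage_dtype = torch.float16                   # stand-in; the diff's guard targets float8 dtypes
shared = torch.nn.Embedding(32, 8).to(storage_dtype)
ids = torch.tensor([[1, 2, 3]])

x = shared(ids).to(torch.float32)               # analogue of shared(input_ids, out_dtype=...)
if storage_dtype not in [torch.float32, torch.float16, torch.bfloat16]:
    x = torch.nan_to_num(x)                     # fp8 guard from the diff
print(x.dtype)  # torch.float32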

Some files were not shown because too many files have changed in this diff.