Compare commits

1 commit

Author: Yoland Y
SHA1: 1d47ec38d8
Message: Set torch version to be 2.3.1 for v0.0.3
Date: 2024-07-26 18:54:29 -07:00

221 changed files with 43688 additions and 161594 deletions


@@ -62,38 +62,12 @@ except:
print("checking out master branch")
branch = repo.lookup_branch('master')
if branch is None:
ref = repo.lookup_reference('refs/remotes/origin/master')
repo.checkout(ref)
branch = repo.lookup_branch('master')
if branch is None:
repo.create_branch('master', repo.get(ref.target))
else:
ref = repo.lookup_reference(branch.name)
repo.checkout(ref)
ref = repo.lookup_reference(branch.name)
repo.checkout(ref)
print("pulling latest changes")
pull(repo)
if "--stable" in sys.argv:
def latest_tag(repo):
versions = []
for k in repo.references:
try:
prefix = "refs/tags/v"
if k.startswith(prefix):
version = list(map(int, k[len(prefix):].split(".")))
versions.append((version[0] * 10000000000 + version[1] * 100000 + version[2], k))
except:
pass
versions.sort()
if len(versions) > 0:
return versions[-1][1]
return None
latest_tag = latest_tag(repo)
if latest_tag is not None:
repo.checkout(latest_tag)
print("Done!")
self_update = True
@@ -134,13 +108,3 @@ if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path):
shutil.copy(repo_req_path, req_path)
except:
pass
stable_update_script = os.path.join(repo_path, ".ci/update_windows/update_comfyui_stable.bat")
stable_update_script_to = os.path.join(cur_path, "update_comfyui_stable.bat")
try:
if not file_size(stable_update_script_to) > 10:
shutil.copy(stable_update_script, stable_update_script_to)
except:
pass


@@ -1,8 +0,0 @@
@echo off
..\python_embeded\python.exe .\update.py ..\ComfyUI\ --stable
if exist update_new.py (
move /y update_new.py update.py
echo Running updater again since it got updated.
..\python_embeded\python.exe .\update.py ..\ComfyUI\ --skip_self_update --stable
)
if "%~1"=="" pause


@@ -1,2 +0,0 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast
pause

.gitattributes

@@ -1,2 +0,0 @@
/web/assets/** linguist-generated
/web/** linguist-vendored


@@ -1,8 +1,5 @@
blank_issues_enabled: true
contact_links:
- name: ComfyUI Frontend Issues
url: https://github.com/Comfy-Org/ComfyUI_frontend/issues
about: Issues related to the ComfyUI frontend (display issues, user interaction bugs), please go to the frontend repo to file the issue
- name: ComfyUI Matrix Space
url: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
about: The ComfyUI Matrix Space is available for support and general discussion related to ComfyUI (Matrix is like Discord but open source).


@@ -1,53 +0,0 @@
# This is the GitHub Workflow that drives full-GPU-enabled tests of pull requests to ComfyUI, when the 'Run-CI-Test' label is added
# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
name: Pull Request CI Workflow Runs
on:
pull_request_target:
types: [labeled]
jobs:
pr-test-stable:
if: ${{ github.event.label.name == 'Run-CI-Test' }}
strategy:
fail-fast: false
matrix:
os: [macos, linux, windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["stable"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
- os: windows
runner_label: [self-hosted, win]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
use_prior_commit: 'true'
comment:
if: ${{ github.event.label.name == 'Run-CI-Test' }}
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- uses: actions/github-script@v6
with:
script: |
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: '(Automated Bot Message) CI Tests are running, you can view the results at https://ci.comfy.org/?branch=${{ github.event.pull_request.number }}%2Fmerge'
})


@@ -2,28 +2,9 @@
name: "Release Stable Version"
on:
workflow_dispatch:
inputs:
git_tag:
description: 'Git tag'
required: true
type: string
cu:
description: 'CUDA version'
required: true
type: string
default: "121"
python_minor:
description: 'Python minor version'
required: true
type: string
default: "11"
python_patch:
description: 'Python patch version'
required: true
type: string
default: "9"
push:
tags:
- 'v*'
jobs:
package_comfy_windows:
@@ -32,44 +13,69 @@ jobs:
packages: "write"
pull-requests: "read"
runs-on: windows-latest
strategy:
matrix:
python_version: [3.11.8]
cuda_version: [121]
steps:
- name: Calculate Minor Version
shell: bash
run: |
# Extract the minor version from the Python version
MINOR_VERSION=$(echo "${{ matrix.python_version }}" | cut -d'.' -f2)
echo "MINOR_VERSION=$MINOR_VERSION" >> $GITHUB_ENV
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
- uses: actions/checkout@v4
with:
ref: ${{ inputs.git_tag }}
fetch-depth: 0
persist-credentials: false
- uses: actions/cache/restore@v4
id: cache
with:
path: |
cu${{ inputs.cu }}_python_deps.tar
update_comfyui_and_python_dependencies.bat
key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }}
- shell: bash
run: |
mv cu${{ inputs.cu }}_python_deps.tar ../
echo "@echo off
call update_comfyui.bat nopause
echo -
echo This will try to update pytorch and all python dependencies.
echo -
echo If you just want to update normally, close this and run update_comfyui.bat instead.
echo -
pause
..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r ../ComfyUI/requirements.txt pygit2
pause" > update_comfyui_and_python_dependencies.bat
python -m pip wheel --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r requirements.txt pygit2 -w ./temp_wheel_dir
python -m pip install --no-cache-dir ./temp_wheel_dir/*
echo installed basic
ls -lah temp_wheel_dir
mv temp_wheel_dir cu${{ matrix.cuda_version }}_python_deps
mv cu${{ matrix.cuda_version }}_python_deps ../
mv update_comfyui_and_python_dependencies.bat ../
cd ..
tar xf cu${{ inputs.cu }}_python_deps.tar
pwd
ls
- shell: bash
run: |
cd ..
cp -r ComfyUI ComfyUI_copy
curl https://www.python.org/ftp/python/3.${{ inputs.python_minor }}.${{ inputs.python_patch }}/python-3.${{ inputs.python_minor }}.${{ inputs.python_patch }}-embed-amd64.zip -o python_embeded.zip
curl https://www.python.org/ftp/python/${{ matrix.python_version }}/python-${{ matrix.python_version }}-embed-amd64.zip -o python_embeded.zip
unzip python_embeded.zip -d python_embeded
cd python_embeded
echo ${{ env.MINOR_VERSION }}
echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
echo 'import site' >> ./python3${{ env.MINOR_VERSION }}._pth
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
./python.exe get-pip.py
./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/*
sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
cd ..
./python.exe --version
echo "Pip version:"
./python.exe -m pip --version
git clone --depth 1 https://github.com/comfyanonymous/taesd
set PATH=$PWD/Scripts:$PATH
echo $PATH
./python.exe -s -m pip install ../cu${{ matrix.cuda_version }}_python_deps/*
sed -i '1i../ComfyUI' ./python3${{ env.MINOR_VERSION }}._pth
cd ..
git clone https://github.com/comfyanonymous/taesd
cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
mkdir ComfyUI_windows_portable
@@ -98,7 +104,6 @@ jobs:
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ComfyUI_windows_portable_nvidia.7z
tag: ${{ inputs.git_tag }}
tag: ${{ github.ref }}
overwrite: true
prerelease: true
make_latest: false

.github/workflows/test-browser.yml (new file)

@@ -0,0 +1,76 @@
# This is a temporary action during frontend TS migration.
# This file should be removed after TS migration is completed.
# The browser test is here to ensure the TS repo is working the same way as the
# current JS code.
# If you are adding a UI feature, please sync your changes to the TS repo:
# huchenlei/ComfyUI_frontend and update test expectation files accordingly.
name: Playwright Browser Tests CI
on:
push:
branches: [ main, master ]
pull_request:
branches: [ main, master ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout ComfyUI
uses: actions/checkout@v4
with:
repository: "comfyanonymous/ComfyUI"
path: "ComfyUI"
- name: Checkout ComfyUI_frontend
uses: actions/checkout@v4
with:
repository: "huchenlei/ComfyUI_frontend"
path: "ComfyUI_frontend"
ref: "fcc54d803e5b6a9b08a462a1d94899318c96dcbb"
- uses: actions/setup-node@v3
with:
node-version: lts/*
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install wait-for-it
working-directory: ComfyUI
- name: Start ComfyUI server
run: |
python main.py --cpu 2>&1 | tee console_output.log &
wait-for-it --service 127.0.0.1:8188 -t 600
working-directory: ComfyUI
- name: Install ComfyUI_frontend dependencies
run: |
npm ci
working-directory: ComfyUI_frontend
- name: Install Playwright Browsers
run: npx playwright install --with-deps
working-directory: ComfyUI_frontend
- name: Run Playwright tests
run: npx playwright test
working-directory: ComfyUI_frontend
- name: Check for unhandled exceptions in server log
run: |
if grep -qE "Exception|Error" console_output.log; then
echo "Unhandled exception/error found in server log."
exit 1
fi
working-directory: ComfyUI
- uses: actions/upload-artifact@v4
if: always()
with:
name: playwright-report
path: ComfyUI_frontend/playwright-report/
retention-days: 30
- uses: actions/upload-artifact@v4
if: always()
with:
name: console-output
path: ComfyUI/console_output.log
retention-days: 30


@@ -1,95 +0,0 @@
# This is the GitHub Workflow that drives automatic full-GPU-enabled tests of all new commits to the master branch of ComfyUI
# Results are reported as checkmarks on the commits, as well as onto https://ci.comfy.org/
name: Full Comfy CI Workflow Runs
on:
push:
branches:
- master
paths-ignore:
- 'app/**'
- 'input/**'
- 'output/**'
- 'notebooks/**'
- 'script_examples/**'
- '.github/**'
- 'web/**'
workflow_dispatch:
jobs:
test-stable:
strategy:
fail-fast: false
matrix:
os: [macos, linux, windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["stable"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
- os: windows
runner_label: [self-hosted, win]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
test-win-nightly:
strategy:
fail-fast: true
matrix:
os: [windows]
python_version: ["3.9", "3.10", "3.11", "3.12"]
cuda_version: ["12.1"]
torch_version: ["nightly"]
include:
- os: windows
runner_label: [self-hosted, win]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}
test-unix-nightly:
strategy:
fail-fast: false
matrix:
os: [macos, linux]
python_version: ["3.11"]
cuda_version: ["12.1"]
torch_version: ["nightly"]
include:
- os: macos
runner_label: [self-hosted, macOS]
flags: "--use-pytorch-cross-attention"
- os: linux
runner_label: [self-hosted, Linux]
flags: ""
runs-on: ${{ matrix.runner_label }}
steps:
- name: Test Workflows
uses: comfy-org/comfy-action@main
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python_version }}
torch_version: ${{ matrix.torch_version }}
google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
comfyui_flags: ${{ matrix.flags }}


@@ -1,45 +0,0 @@
name: Test server launches without errors
on:
push:
branches: [ main, master ]
pull_request:
branches: [ main, master ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout ComfyUI
uses: actions/checkout@v4
with:
repository: "comfyanonymous/ComfyUI"
path: "ComfyUI"
- uses: actions/setup-python@v4
with:
python-version: '3.8'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install wait-for-it
working-directory: ComfyUI
- name: Start ComfyUI server
run: |
python main.py --cpu 2>&1 | tee console_output.log &
wait-for-it --service 127.0.0.1:8188 -t 600
working-directory: ComfyUI
- name: Check for unhandled exceptions in server log
run: |
if grep -qE "Exception|Error" console_output.log; then
echo "Unhandled exception/error found in server log."
exit 1
fi
working-directory: ComfyUI
- uses: actions/upload-artifact@v4
if: always()
with:
name: console-output
path: ComfyUI/console_output.log
retention-days: 30

.github/workflows/test-ui.yaml (new file)

@@ -0,0 +1,30 @@
name: Tests CI
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v3
with:
node-version: 18
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
- name: Run Tests
run: |
npm ci
npm run test:generate
npm test -- --verbose
working-directory: ./tests-ui
- name: Run Unit Tests
run: |
pip install -r tests-unit/requirements.txt
python -m pytest tests-unit


@@ -8,16 +8,11 @@ on:
required: false
type: string
default: ""
extra_dependencies:
description: 'extra dependencies'
required: false
type: string
default: "\"numpy<2\""
cu:
description: 'cuda version'
required: true
type: string
default: "124"
default: "121"
python_minor:
description: 'python minor version'
@@ -29,7 +24,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "9"
default: "8"
# push:
# branches:
# - master
@@ -56,7 +51,7 @@ jobs:
..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
pause" > update_comfyui_and_python_dependencies.bat
python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
python -m pip install --no-cache-dir ./temp_wheel_dir/*
echo installed basic
ls -lah temp_wheel_dir


@@ -19,7 +19,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "4"
default: "3"
# push:
# branches:
# - master
@@ -49,13 +49,13 @@ jobs:
echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
./python.exe get-pip.py
python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
python -m pip wheel torch torchvision torchaudio mpmath==1.3.0 numpy==1.26.4 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
ls ../temp_wheel_dir
./python.exe -s -m pip install --pre ../temp_wheel_dir/*
sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
cd ..
git clone --depth 1 https://github.com/comfyanonymous/taesd
git clone https://github.com/comfyanonymous/taesd
cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
mkdir ComfyUI_windows_portable_nightly_pytorch
@@ -67,7 +67,6 @@ jobs:
mkdir update
cp -r ComfyUI/.ci/update_windows/* ./update/
cp -r ComfyUI/.ci/windows_base_files/* ./
cp -r ComfyUI/.ci/windows_nightly_base_files/* ./
echo "call update_comfyui.bat nopause
..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2


@@ -7,7 +7,7 @@ on:
description: 'cuda version'
required: true
type: string
default: "124"
default: "121"
python_minor:
description: 'python minor version'
@@ -19,7 +19,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "9"
default: "8"
# push:
# branches:
# - master
@@ -66,7 +66,7 @@ jobs:
sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
cd ..
git clone --depth 1 https://github.com/comfyanonymous/taesd
git clone https://github.com/comfyanonymous/taesd
cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
mkdir ComfyUI_windows_portable

.gitignore

@@ -18,5 +18,4 @@ venv/
/tests-ui/data/object_info.json
/user/
*.log
web_custom_versions/
.DS_Store
web_custom_versions/


@@ -1,35 +1,8 @@
<div align="center">
# ComfyUI
**The most powerful and modular stable diffusion GUI and backend.**
[![Website][website-shield]][website-url]
[![Dynamic JSON Badge][discord-shield]][discord-url]
[![Matrix][matrix-shield]][matrix-url]
<br>
[![][github-release-shield]][github-release-link]
[![][github-release-date-shield]][github-release-link]
[![][github-downloads-shield]][github-downloads-link]
[![][github-downloads-latest-shield]][github-downloads-link]
[matrix-shield]: https://img.shields.io/badge/Matrix-000000?style=flat&logo=matrix&logoColor=white
[matrix-url]: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
[website-shield]: https://img.shields.io/badge/ComfyOrg-4285F4?style=flat
[website-url]: https://www.comfy.org/
<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
[discord-url]: https://www.comfy.org/discord
[github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
[github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
[github-release-date-shield]: https://img.shields.io/github/release-date/comfyanonymous/ComfyUI?style=flat
[github-downloads-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/total?style=flat
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
ComfyUI
=======
The most powerful and modular stable diffusion GUI and backend.
-----------
![ComfyUI Screenshot](comfyui_screenshot.png)
</div>
This UI will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and to see what ComfyUI can do, you can check out:
### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
@@ -39,7 +12,6 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
## Features
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that change between executions.
- Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -61,7 +33,6 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Starts up very fast.
- Works fully offline: will never download anything.
@@ -75,7 +46,6 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
|------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| Ctrl + Enter | Queue up current graph for generation |
| Ctrl + Shift + Enter | Queue up current graph as first for generation |
| Ctrl + Alt + Enter | Cancel current generation |
| Ctrl + Z/Ctrl + Y | Undo/Redo |
| Ctrl + S | Save workflow |
| Ctrl + O | Load workflow |
@@ -98,8 +68,6 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
| H | Toggle visibility of history |
| R | Refresh graph |
| Double-Click LMB | Open node quick search palette |
| Shift + Drag | Move multiple wires at once |
| Ctrl + Alt + LMB | Disconnect all wires from clicked slot |
Ctrl can also be replaced with Cmd instead for macOS users
@@ -109,7 +77,7 @@ Ctrl can also be replaced with Cmd instead for macOS users
There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
@@ -195,6 +163,20 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
### I already have another UI for Stable Diffusion installed do I really have to install all of these dependencies?
You don't. If you have another UI installed and working with its own python venv you can use that venv to run ComfyUI. You can open up your favorite terminal and activate it:
```source path_to_other_sd_gui/venv/bin/activate```
or on Windows:
With Powershell: ```"path_to_other_sd_gui\venv\Scripts\Activate.ps1"```
With cmd.exe: ```"path_to_other_sd_gui\venv\Scripts\activate.bat"```
And then you can use that terminal to run ComfyUI without installing any dependencies. Note that the venv folder might be called something else depending on the SD UI.
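A minimal sketch of the whole sequence on Linux/macOS (paths are placeholders, and the venv is assumed to already satisfy ComfyUI's requirements):
```
# Illustrative: reuse another SD UI's venv to run ComfyUI.
source path_to_other_sd_gui/venv/bin/activate   # activate the existing venv
cd ComfyUI                                      # assumed ComfyUI checkout directory
python main.py                                  # runs with the activated venv's Python
```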
# Running
```python main.py```
@@ -230,7 +212,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the
Use ```--preview-method auto``` to enable previews.
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
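A hedged shell sketch of that setup (decoder URL and folder as given above; the launch flag comes from the updated paragraph):
```
# Illustrative: fetch a TAESD decoder and enable high-quality previews.
cd ComfyUI                                   # assumed ComfyUI checkout
curl -L -o models/vae_approx/taesd_decoder.pth https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth
python main.py --preview-method taesd        # relaunch with TAESD previews enabled
```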
## How to use TLS/SSL?
Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
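A minimal launch sketch pairing that certificate with the `--tls-keyfile`/`--tls-certfile` flags described just below:
```
# Illustrative: serve ComfyUI over HTTPS with the self-signed key/cert generated above.
python main.py --tls-keyfile key.pem --tls-certfile cert.pem
```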
@@ -246,47 +228,6 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
See also: [https://www.comfy.org/](https://www.comfy.org/)
## Frontend Development
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
### Reporting Issues and Requesting Features
For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
### Using the Latest Frontend
The new frontend is now the default for ComfyUI. However, please note:
1. The frontend in the main ComfyUI repository is updated weekly.
2. Daily releases are available in the separate frontend repository.
To use the most up-to-date frontend version:
1. For the latest daily release, launch ComfyUI with this command line argument:
```
--front-end-version Comfy-Org/ComfyUI_frontend@latest
```
2. For a specific version, replace `latest` with the desired version number:
```
--front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
```
This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.
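For instance (a sketch using the launch command and version number quoted above):
```
# Illustrative: launch ComfyUI pinned to a specific frontend release.
python main.py --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
```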
### Accessing the Legacy Frontend
If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
```
--front-end-version Comfy-Org/ComfyUI_legacy_frontend@latest
```
This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
# QA
### Which GPU should I buy for this?


@@ -1,3 +0,0 @@
# ComfyUI Internal Routes
All routes under the `/internal` path are designated for **internal use by ComfyUI only**. These routes are not intended for use by external applications and may change at any time without notice.


@@ -1,40 +0,0 @@
from aiohttp import web
from typing import Optional
from folder_paths import models_dir, user_directory, output_directory
from api_server.services.file_service import FileService
class InternalRoutes:
'''
The top level web router for internal routes: /internal/*
The endpoints here should NOT be depended upon. It is for ComfyUI frontend use only.
Check README.md for more information.
'''
def __init__(self):
self.routes: web.RouteTableDef = web.RouteTableDef()
self._app: Optional[web.Application] = None
self.file_service = FileService({
"models": models_dir,
"user": user_directory,
"output": output_directory
})
def setup_routes(self):
@self.routes.get('/files')
async def list_files(request):
directory_key = request.query.get('directory', '')
try:
file_list = self.file_service.list_files(directory_key)
return web.json_response({"files": file_list})
except ValueError as e:
return web.json_response({"error": str(e)}, status=400)
except Exception as e:
return web.json_response({"error": str(e)}, status=500)
def get_app(self):
if self._app is None:
self._app = web.Application()
self.setup_routes()
self._app.add_routes(self.routes)
return self._app


@@ -1,13 +0,0 @@
from typing import Dict, List, Optional
from api_server.utils.file_operations import FileSystemOperations, FileSystemItem
class FileService:
def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
self.allowed_directories: Dict[str, str] = allowed_directories
self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()
def list_files(self, directory_key: str) -> List[FileSystemItem]:
if directory_key not in self.allowed_directories:
raise ValueError("Invalid directory key")
directory_path: str = self.allowed_directories[directory_key]
return self.file_system_ops.walk_directory(directory_path)


@@ -1,42 +0,0 @@
import os
from typing import List, Union, TypedDict, Literal
from typing_extensions import TypeGuard
class FileInfo(TypedDict):
name: str
path: str
type: Literal["file"]
size: int
class DirectoryInfo(TypedDict):
name: str
path: str
type: Literal["directory"]
FileSystemItem = Union[FileInfo, DirectoryInfo]
def is_file_info(item: FileSystemItem) -> TypeGuard[FileInfo]:
return item["type"] == "file"
class FileSystemOperations:
@staticmethod
def walk_directory(directory: str) -> List[FileSystemItem]:
file_list: List[FileSystemItem] = []
for root, dirs, files in os.walk(directory):
for name in files:
file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, directory)
file_list.append({
"name": name,
"path": relative_path,
"type": "file",
"size": os.path.getsize(file_path)
})
for name in dirs:
dir_path = os.path.join(root, name)
relative_path = os.path.relpath(dir_path, directory)
file_list.append({
"name": name,
"path": relative_path,
"type": "directory"
})
return file_list


@@ -92,10 +92,6 @@ class LatentPreviewMethod(enum.Enum):
parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)
cache_group = parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
@@ -116,14 +112,10 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.")
parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")
parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")


@@ -5,7 +5,7 @@
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_size": 1280,
"initializer_factor": 1.0,


@@ -1,6 +1,5 @@
import torch
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
class CLIPAttention(torch.nn.Module):
def __init__(self, embed_dim, heads, dtype, device, operations):
@@ -72,13 +71,13 @@ class CLIPEncoder(torch.nn.Module):
return x, intermediate
class CLIPEmbeddings(torch.nn.Module):
def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, operations=None):
def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
super().__init__()
self.token_embedding = operations.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
def forward(self, input_tokens, dtype=torch.float32):
return self.token_embedding(input_tokens, out_dtype=dtype) + comfy.ops.cast_to(self.position_embedding.weight, dtype=dtype, device=input_tokens.device)
def forward(self, input_tokens):
return self.token_embedding(input_tokens) + self.position_embedding.weight
class CLIPTextModel_(torch.nn.Module):
@@ -88,16 +87,14 @@ class CLIPTextModel_(torch.nn.Module):
heads = config_dict["num_attention_heads"]
intermediate_size = config_dict["intermediate_size"]
intermediate_activation = config_dict["hidden_act"]
num_positions = config_dict["max_position_embeddings"]
self.eos_token_id = config_dict["eos_token_id"]
super().__init__()
self.embeddings = CLIPEmbeddings(embed_dim, num_positions=num_positions, dtype=dtype, device=device, operations=operations)
self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
x = self.embeddings(input_tokens, dtype=dtype)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
x = self.embeddings(input_tokens)
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -114,7 +111,7 @@ class CLIPTextModel_(torch.nn.Module):
if i is not None and final_layer_norm_intermediate:
i = self.final_layer_norm(i)
pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
return x, i, pooled_output
class CLIPTextModel(torch.nn.Module):
@@ -124,6 +121,7 @@ class CLIPTextModel(torch.nn.Module):
self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
embed_dim = config_dict["hidden_size"]
self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
self.text_projection.weight.copy_(torch.eye(embed_dim))
self.dtype = dtype
def get_input_embeddings(self):
@@ -155,11 +153,11 @@ class CLIPVisionEmbeddings(torch.nn.Module):
num_patches = (image_size // patch_size) ** 2
num_positions = num_patches + 1
self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
def forward(self, pixel_values):
embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
return torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)
return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device)
class CLIPVision(torch.nn.Module):
@@ -171,7 +169,7 @@ class CLIPVision(torch.nn.Module):
intermediate_size = config_dict["intermediate_size"]
intermediate_activation = config_dict["hidden_act"]
self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=dtype, device=device, operations=operations)
self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
self.pre_layrnorm = operations.LayerNorm(embed_dim)
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.post_layernorm = operations.LayerNorm(embed_dim)


@@ -1,24 +1,4 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
from enum import Enum
import math
import os
import logging
@@ -33,8 +13,6 @@ import comfy.cldm.cldm
import comfy.t2i_adapter.adapter
import comfy.ldm.cascade.controlnet
import comfy.cldm.mmdit
import comfy.ldm.hydit.controlnet
import comfy.ldm.flux.controlnet_xlabs
def broadcast_image_to(tensor, target_batch_size, batched_number):
@@ -55,10 +33,6 @@ def broadcast_image_to(tensor, target_batch_size, batched_number):
else:
return torch.cat([tensor] * batched_number, dim=0)
class StrengthType(Enum):
CONSTANT = 1
LINEAR_UP = 2
class ControlBase:
def __init__(self, device=None):
self.cond_hint_original = None
@@ -77,8 +51,6 @@ class ControlBase:
device = comfy.model_management.get_torch_device()
self.device = device
self.previous_controlnet = None
self.extra_conds = []
self.strength_type = StrengthType.CONSTANT
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None):
self.cond_hint_original = cond_hint
@@ -121,8 +93,6 @@ class ControlBase:
c.latent_format = self.latent_format
c.extra_args = self.extra_args.copy()
c.vae = self.vae
c.extra_conds = self.extra_conds.copy()
c.strength_type = self.strength_type
def inference_memory_requirements(self, dtype):
if self.previous_controlnet is not None:
@@ -143,10 +113,7 @@ class ControlBase:
if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once
applied_to.add(x)
if self.strength_type == StrengthType.CONSTANT:
x *= self.strength
elif self.strength_type == StrengthType.LINEAR_UP:
x *= (self.strength ** float(len(control_output) - i))
x *= self.strength
if x.dtype != output_dtype:
x = x.to(output_dtype)
@@ -175,7 +142,7 @@ class ControlBase:
class ControlNet(ControlBase):
def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT):
def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None):
super().__init__(device)
self.control_model = control_model
self.load_device = load_device
@@ -187,8 +154,6 @@ class ControlNet(ControlBase):
self.model_sampling_current = None
self.manual_cast_dtype = manual_cast_dtype
self.latent_format = latent_format
self.extra_conds += extra_conds
self.strength_type = strength_type
def get_control(self, x_noisy, t, cond, batched_number):
control_prev = None
@@ -226,16 +191,13 @@ class ControlNet(ControlBase):
self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
context = cond.get('crossattn_controlnet', cond['c_crossattn'])
extra = self.extra_args.copy()
for c in self.extra_conds:
temp = cond.get(c, None)
if temp is not None:
extra[c] = temp.to(dtype)
y = cond.get('y', None)
if y is not None:
y = y.to(dtype)
timestep = self.model_sampling_current.timestep(t)
x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)
control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra)
control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y, **self.extra_args)
return self.control_merge(control, control_prev, output_dtype)
def copy(self):
@@ -324,7 +286,6 @@ class ControlLora(ControlNet):
ControlBase.__init__(self, device)
self.control_weights = control_weights
self.global_average_pooling = global_average_pooling
self.extra_conds += ["y"]
def pre_run(self, model, percent_to_timestep_function):
super().pre_run(model, percent_to_timestep_function)
@@ -377,8 +338,12 @@ class ControlLora(ControlNet):
def inference_memory_requirements(self, dtype):
return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)
def controlnet_config(sd):
model_config = comfy.model_detection.model_config_from_unet(sd, "", True)
def load_controlnet_mmdit(sd):
new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
model_config = comfy.model_detection.model_config_from_unet(new_sd, "", True)
num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
for k in sd:
new_sd[k] = sd[k]
supported_inference_dtypes = model_config.supported_inference_dtypes
@@ -391,28 +356,14 @@ def controlnet_config(sd):
else:
operations = comfy.ops.disable_weight_init
offload_device = comfy.model_management.unet_offload_device()
return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
def controlnet_load_state_dict(control_model, sd):
missing, unexpected = control_model.load_state_dict(sd, strict=False)
control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **controlnet_config)
missing, unexpected = control_model.load_state_dict(new_sd, strict=False)
if len(missing) > 0:
logging.warning("missing controlnet keys: {}".format(missing))
if len(unexpected) > 0:
logging.debug("unexpected controlnet keys: {}".format(unexpected))
return control_model
def load_controlnet_mmdit(sd):
new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd)
num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
for k in sd:
new_sd[k] = sd[k]
control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
control_model = controlnet_load_state_dict(control_model, new_sd)
latent_format = comfy.latent_formats.SD3()
latent_format.shift_factor = 0 #SD3 controlnet weirdness
@@ -420,31 +371,8 @@ def load_controlnet_mmdit(sd):
return control
def load_controlnet_hunyuandit(controlnet_data):
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data)
control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
control_model = controlnet_load_state_dict(control_model, controlnet_data)
latent_format = comfy.latent_formats.SDXL()
extra_conds = ['text_embedding_mask', 'encoder_hidden_states_t5', 'text_embedding_mask_t5', 'image_meta_size', 'style', 'cos_cis_img', 'sin_cis_img']
control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds, strength_type=StrengthType.CONSTANT)
return control
def load_controlnet_flux_xlabs(sd):
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd)
control_model = comfy.ldm.flux.controlnet_xlabs.ControlNetFlux(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
control_model = controlnet_load_state_dict(control_model, sd)
extra_conds = ['y', 'guidance']
control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
return control
def load_controlnet(ckpt_path, model=None):
controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
if 'after_proj_list.18.bias' in controlnet_data.keys(): #Hunyuan DiT
return load_controlnet_hunyuandit(controlnet_data)
if "lora_controlnet" in controlnet_data:
return ControlLora(controlnet_data)
@@ -502,10 +430,7 @@ def load_controlnet(ckpt_path, model=None):
logging.warning("leftover keys: {}".format(leftover_keys))
controlnet_data = new_sd
elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format
if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
return load_controlnet_flux_xlabs(controlnet_data)
else:
return load_controlnet_mmdit(controlnet_data)
return load_controlnet_mmdit(controlnet_data)
pth_key = 'control_model.zero_convs.0.0.weight'
pth = False
@@ -537,7 +462,6 @@ def load_controlnet(ckpt_path, model=None):
if manual_cast_dtype is not None:
controlnet_config["operations"] = comfy.ops.manual_cast
controlnet_config["dtype"] = unet_dtype
controlnet_config["device"] = comfy.model_management.unet_offload_device()
controlnet_config.pop("out_channels")
controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)


@@ -22,7 +22,7 @@ def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_dire
if text_encoder2_path is not None:
text_encoder_paths.append(text_encoder2_path)
unet = comfy.sd.load_diffusion_model(unet_path)
unet = comfy.sd.load_unet(unet_path)
clip = None
if output_clip:


@@ -1,59 +0,0 @@
import torch
#Not 100% sure about this
def manual_stochastic_round_to_float8(x, dtype):
if dtype == torch.float8_e4m3fn:
EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
elif dtype == torch.float8_e5m2:
EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
else:
raise ValueError("Unsupported dtype")
sign = torch.sign(x)
abs_x = x.abs()
# Combine exponent calculation and clamping
exponent = torch.clamp(
torch.floor(torch.log2(abs_x)).to(torch.int32) + EXPONENT_BIAS,
0, 2**EXPONENT_BITS - 1
)
# Combine mantissa calculation and rounding
# min_normal = 2.0 ** (-EXPONENT_BIAS + 1)
# zero_mask = (abs_x == 0)
# subnormal_mask = (exponent == 0) & (abs_x != 0)
normal_mask = ~(exponent == 0)
mantissa_scaled = torch.where(
normal_mask,
(abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
(abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
)
mantissa_floor = mantissa_scaled.floor()
mantissa = torch.where(
torch.rand_like(mantissa_scaled) < (mantissa_scaled - mantissa_floor),
(mantissa_floor + 1) / (2**MANTISSA_BITS),
mantissa_floor / (2**MANTISSA_BITS)
)
result = torch.where(
normal_mask,
sign * (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + mantissa),
sign * (2.0 ** (-EXPONENT_BIAS + 1)) * mantissa
)
result = torch.where(abs_x == 0, 0, result)
return result.to(dtype=dtype)
def stochastic_rounding(value, dtype):
if dtype == torch.float32:
return value.to(dtype=torch.float32)
if dtype == torch.float16:
return value.to(dtype=torch.float16)
if dtype == torch.bfloat16:
return value.to(dtype=torch.bfloat16)
if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
return manual_stochastic_round_to_float8(value, dtype)
return value.to(dtype=dtype)


@@ -9,7 +9,6 @@ from tqdm.auto import trange, tqdm
from . import utils
from . import deis
import comfy.model_patcher
import comfy.model_sampling
def append_zero(x):
return torch.cat([x, x.new_zeros([1])])
@@ -510,9 +509,6 @@ def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callbac
@torch.no_grad()
def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
return sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
"""Ancestral sampling with DPM-Solver++(2S) second-order steps."""
extra_args = {} if extra_args is None else extra_args
noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
@@ -545,55 +541,6 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
return x
@torch.no_grad()
def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
"""Ancestral sampling with DPM-Solver++(2S) second-order steps."""
extra_args = {} if extra_args is None else extra_args
noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
# logged_x = x.unsqueeze(0)
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
sigma_down = sigmas[i+1] * downstep_ratio
alpha_ip1 = 1 - sigmas[i+1]
alpha_down = 1 - sigma_down
renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
# sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
if sigmas[i + 1] == 0:
# Euler method
d = to_d(x, sigmas[i], denoised)
dt = sigma_down - sigmas[i]
x = x + d * dt
else:
# DPM-Solver++(2S)
if sigmas[i] == 1.0:
sigma_s = 0.9999
else:
t_i, t_down = lambda_fn(sigmas[i]), lambda_fn(sigma_down)
r = 1 / 2
h = t_down - t_i
s = t_i + r * h
sigma_s = sigma_fn(s)
# sigma_s = sigmas[i+1]
sigma_s_i_ratio = sigma_s / sigmas[i]
u = sigma_s_i_ratio * x + (1 - sigma_s_i_ratio) * denoised
D_i = model(u, sigma_s * s_in, **extra_args)
sigma_down_i_ratio = sigma_down / sigmas[i]
x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * D_i
# print("sigma_i", sigmas[i], "sigma_ip1", sigmas[i+1],"sigma_down", sigma_down, "sigma_down_i_ratio", sigma_down_i_ratio, "sigma_s_i_ratio", sigma_s_i_ratio, "renoise_coeff", renoise_coeff)
# Noise addition
if sigmas[i + 1] > 0 and eta > 0:
x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
# logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
return x
@torch.no_grad()
def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
"""DPM-Solver++ (stochastic)."""


@@ -139,34 +139,3 @@ class SD3(LatentFormat):
class StableAudio1(LatentFormat):
latent_channels = 64
class Flux(SD3):
latent_channels = 16
def __init__(self):
self.scale_factor = 0.3611
self.shift_factor = 0.1159
self.latent_rgb_factors =[
[-0.0404, 0.0159, 0.0609],
[ 0.0043, 0.0298, 0.0850],
[ 0.0328, -0.0749, -0.0503],
[-0.0245, 0.0085, 0.0549],
[ 0.0966, 0.0894, 0.0530],
[ 0.0035, 0.0399, 0.0123],
[ 0.0583, 0.1184, 0.1262],
[-0.0191, -0.0206, -0.0306],
[-0.0324, 0.0055, 0.1001],
[ 0.0955, 0.0659, -0.0545],
[-0.0504, 0.0231, -0.0013],
[ 0.0500, -0.0008, -0.0088],
[ 0.0982, 0.0941, 0.0976],
[-0.1233, -0.0280, -0.0897],
[-0.0005, -0.0530, -0.0020],
[-0.1273, -0.0932, -0.0680]
]
self.taesd_decoder_name = "taef1_decoder"
def process_in(self, latent):
return (latent - self.shift_factor) * self.scale_factor
def process_out(self, latent):
return (latent / self.scale_factor) + self.shift_factor


@@ -9,7 +9,6 @@ from einops import rearrange
from torch import nn
from torch.nn import functional as F
import math
import comfy.ops
class FourierFeatures(nn.Module):
def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -19,7 +18,7 @@ class FourierFeatures(nn.Module):
[out_features // 2, in_features], dtype=dtype, device=device))
def forward(self, input):
f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
f = 2 * math.pi * input @ self.weight.T.to(dtype=input.dtype, device=input.device)
return torch.cat([f.cos(), f.sin()], dim=-1)
# norms
@@ -39,9 +38,9 @@ class LayerNorm(nn.Module):
def forward(self, x):
beta = self.beta
if beta is not None:
beta = comfy.ops.cast_to_input(beta, x)
return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
if self.beta is not None:
beta = beta.to(dtype=x.dtype, device=x.device)
return F.layer_norm(x, x.shape[-1:], weight=self.gamma.to(dtype=x.dtype, device=x.device), bias=beta)
class GLU(nn.Module):
def __init__(
@@ -124,9 +123,7 @@ class RotaryEmbedding(nn.Module):
scale_base = 512,
interpolation_factor = 1.,
base = 10000,
base_rescale_factor = 1.,
dtype=None,
device=None,
base_rescale_factor = 1.
):
super().__init__()
# proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
@@ -134,8 +131,8 @@ class RotaryEmbedding(nn.Module):
# https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
base *= base_rescale_factor ** (dim / (dim - 2))
# inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', torch.empty((dim // 2,), device=device, dtype=dtype))
inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq)
assert interpolation_factor >= 1.
self.interpolation_factor = interpolation_factor
@@ -164,14 +161,14 @@ class RotaryEmbedding(nn.Module):
t = t / self.interpolation_factor
freqs = torch.einsum('i , j -> i j', t, comfy.ops.cast_to_input(self.inv_freq, t))
freqs = torch.einsum('i , j -> i j', t, self.inv_freq.to(dtype=dtype, device=device))
freqs = torch.cat((freqs, freqs), dim = -1)
if self.scale is None:
return freqs, 1.
power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
scale = self.scale.to(dtype=dtype, device=device) ** rearrange(power, 'n -> n 1')
scale = torch.cat((scale, scale), dim = -1)
return freqs, scale
@@ -571,7 +568,7 @@ class ContinuousTransformer(nn.Module):
self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
if rotary_pos_emb:
self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32), device=device, dtype=dtype)
self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
else:
self.rotary_pos_emb = None


@@ -8,8 +8,6 @@ import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.attention import optimized_attention
import comfy.ops
import comfy.ldm.common_dit
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -408,7 +406,10 @@ class MMDiT(nn.Module):
def patchify(self, x):
B, C, H, W = x.size()
x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
pad_h = (self.patch_size - H % self.patch_size) % self.patch_size
pad_w = (self.patch_size - W % self.patch_size) % self.patch_size
x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
x = x.view(
B,
C,
@@ -426,7 +427,7 @@ class MMDiT(nn.Module):
max_dim = max(h, w)
cur_dim = self.h_max
pos_encoding = comfy.ops.cast_to_input(self.positional_encoding.reshape(1, cur_dim, cur_dim, -1), x)
pos_encoding = self.positional_encoding.reshape(1, cur_dim, cur_dim, -1).to(device=x.device, dtype=x.dtype)
if max_dim > cur_dim:
pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1)
@@ -454,7 +455,7 @@ class MMDiT(nn.Module):
t = timestep
c = self.cond_seq_linear(c_seq) # B, T_c, D
c = torch.cat([comfy.ops.cast_to_input(self.register_tokens, c).repeat(c.size(0), 1, 1), c], dim=1)
c = torch.cat([self.register_tokens.to(device=c.device, dtype=c.dtype).repeat(c.size(0), 1, 1), c], dim=1)
global_cond = self.t_embedder(t, x.dtype) # B, D

View File

@@ -19,7 +19,14 @@
import torch
import torch.nn as nn
from comfy.ldm.modules.attention import optimized_attention
import comfy.ops
class Linear(torch.nn.Linear):
def reset_parameters(self):
return None
class Conv2d(torch.nn.Conv2d):
def reset_parameters(self):
return None
class OptimizedAttention(nn.Module):
def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
@@ -71,13 +78,13 @@ class GlobalResponseNorm(nn.Module):
"from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105"
def __init__(self, dim, dtype=None, device=None):
super().__init__()
self.gamma = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))
self.beta = nn.Parameter(torch.empty(1, 1, 1, dim, dtype=dtype, device=device))
self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim, dtype=dtype, device=device))
def forward(self, x):
Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
return comfy.ops.cast_to_input(self.gamma, x) * (x * Nx) + comfy.ops.cast_to_input(self.beta, x) + x
return self.gamma.to(device=x.device, dtype=x.dtype) * (x * Nx) + self.beta.to(device=x.device, dtype=x.dtype) + x
class ResBlock(nn.Module):

View File

@@ -1,8 +0,0 @@
import torch
def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
if padding_mode == "circular" and torch.jit.is_tracing() or torch.jit.is_scripting():
padding_mode = "reflect"
pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)
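A minimal usage sketch of pad_to_patch_size as defined above (sizes assumed): spatial dimensions that are not multiples of the patch size are padded on the bottom and right only.
import torch

img = torch.zeros(1, 4, 61, 45)                      # H=61, W=45, patch size (2, 2)
padded = pad_to_patch_size(img, patch_size=(2, 2))   # pad_h = pad_w = 1
print(padded.shape)                                   # torch.Size([1, 4, 62, 46])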

View File

@@ -1,104 +0,0 @@
#Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
import torch
from torch import Tensor, nn
from einops import rearrange, repeat
from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
MLPEmbedder, SingleStreamBlock,
timestep_embedding)
from .model import Flux
import comfy.ldm.common_dit
class ControlNetFlux(Flux):
def __init__(self, image_model=None, dtype=None, device=None, operations=None, **kwargs):
super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)
# add ControlNet blocks
self.controlnet_blocks = nn.ModuleList([])
for _ in range(self.params.depth):
controlnet_block = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
# controlnet_block = zero_module(controlnet_block)
self.controlnet_blocks.append(controlnet_block)
self.pos_embed_input = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
self.gradient_checkpointing = False
self.input_hint_block = nn.Sequential(
operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
nn.SiLU(),
operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
)
def forward_orig(
self,
img: Tensor,
img_ids: Tensor,
controlnet_cond: Tensor,
txt: Tensor,
txt_ids: Tensor,
timesteps: Tensor,
y: Tensor,
guidance: Tensor = None,
) -> Tensor:
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
controlnet_cond = self.input_hint_block(controlnet_cond)
controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
controlnet_cond = self.pos_embed_input(controlnet_cond)
img = img + controlnet_cond
vec = self.time_in(timestep_embedding(timesteps, 256))
if self.params.guidance_embed:
vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
ids = torch.cat((txt_ids, img_ids), dim=1)
pe = self.pe_embedder(ids)
block_res_samples = ()
for block in self.double_blocks:
img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
block_res_samples = block_res_samples + (img,)
controlnet_block_res_samples = ()
for block_res_sample, controlnet_block in zip(block_res_samples, self.controlnet_blocks):
block_res_sample = controlnet_block(block_res_sample)
controlnet_block_res_samples = controlnet_block_res_samples + (block_res_sample,)
return {"input": (controlnet_block_res_samples * 10)[:19]}
def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
hint = hint * 2.0 - 1.0
bs, c, h, w = x.shape
patch_size = 2
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
h_len = ((h + (patch_size // 2)) // patch_size)
w_len = ((w + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
return self.forward_orig(img, img_ids, hint, context, txt_ids, timesteps, y, guidance)
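One detail worth unpacking in forward_orig above: controlnet_block_res_samples is a plain Python tuple, so * 10 repeats it and [:19] truncates, cycling however many residuals this ControlNet produces across up to 19 double-stream blocks of the base model. A minimal sketch of the same trick:
residuals = ("r0", "r1", "r2", "r3", "r4")   # e.g. a ControlNet with 5 double blocks
per_block = (residuals * 10)[:19]            # cycled, then truncated to 19 entries
print(per_block[:6], len(per_block))         # ('r0', 'r1', 'r2', 'r3', 'r4', 'r0') 19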

View File

@@ -1,251 +0,0 @@
import math
from dataclasses import dataclass
import torch
from torch import Tensor, nn
from .math import attention, rope
import comfy.ops
class EmbedND(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: list):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = axes_dim
def forward(self, ids: Tensor) -> Tensor:
n_axes = ids.shape[-1]
emb = torch.cat(
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
dim=-3,
)
return emb.unsqueeze(1)
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
t = time_factor * t
half = dim // 2
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
if torch.is_floating_point(t):
embedding = embedding.to(t)
return embedding
class MLPEmbedder(nn.Module):
def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.silu = nn.SiLU()
self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
def forward(self, x: Tensor) -> Tensor:
return self.out_layer(self.silu(self.in_layer(x)))
class RMSNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
def forward(self, x: Tensor):
x_dtype = x.dtype
x = x.float()
rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
return (x * rrms).to(dtype=x_dtype) * comfy.ops.cast_to(self.scale, dtype=x_dtype, device=x.device)
class QKNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
q = self.query_norm(q)
k = self.key_norm(k)
return q.to(v), k.to(v)
class SelfAttention(nn.Module):
def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
@dataclass
class ModulationOut:
shift: Tensor
scale: Tensor
gate: Tensor
class Modulation(nn.Module):
def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
super().__init__()
self.is_double = double
self.multiplier = 6 if double else 3
self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
def forward(self, vec: Tensor) -> tuple:
out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
return (
ModulationOut(*out[:3]),
ModulationOut(*out[3:]) if self.is_double else None,
)
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
# prepare image for attention
img_modulated = self.img_norm1(img)
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
img_qkv = self.img_attn.qkv(img_modulated)
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
# prepare txt for attention
txt_modulated = self.txt_norm1(txt)
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
txt_qkv = self.txt_attn.qkv(txt_modulated)
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
# run actual attention
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2), pe=pe)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
# calculate the img blocks
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
# calculate the txt blocks
txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
if txt.dtype == torch.float16:
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
return img, txt
class SingleStreamBlock(nn.Module):
"""
A DiT block with parallel linear layers as described in
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float = 4.0,
qk_scale: float = None,
dtype=None,
device=None,
operations=None
):
super().__init__()
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
# qkv and mlp_in
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
# proj and mlp_out
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.mlp_act = nn.GELU(approximate="tanh")
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
mod, _ = self.modulation(vec)
x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x += mod.gate * output
if x.dtype == torch.float16:
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
return x
class LastLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
x = self.linear(x)
return x
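The timestep_embedding docstring above describes a standard sinusoidal embedding. A self-contained shape check of the same cos/sin construction (re-derived here, with the time_factor scaling omitted):
import math
import torch

def sinusoidal(t, dim, max_period=10000.0):
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    args = t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

print(sinusoidal(torch.tensor([0.0, 1.0, 250.0]), 256).shape)   # torch.Size([3, 256])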

View File

@@ -1,35 +0,0 @@
import torch
from einops import rearrange
from torch import Tensor
from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
q, k = apply_rope(q, k, pe)
heads = q.shape[1]
x = optimized_attention(q, k, v, heads, skip_reshape=True)
return x
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
assert dim % 2 == 0
if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
device = torch.device("cpu")
else:
device = pos.device
scale = torch.linspace(0, (dim - 2) / dim, steps=dim//2, dtype=torch.float64, device=device)
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega)
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
return out.to(dtype=torch.float32, device=pos.device)
def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
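The (2, 2) blocks built by rope above are ordinary rotation matrices, and apply_rope rotates each (even, odd) channel pair of q and k by them. A self-contained sketch of one such block (angle value assumed):
import torch

theta_val = torch.tensor(0.3)
block = torch.stack([torch.cos(theta_val), -torch.sin(theta_val),
                     torch.sin(theta_val), torch.cos(theta_val)]).reshape(2, 2)
pair = torch.tensor([1.0, 0.0])              # one (even, odd) channel pair of q or k
print(block @ pair)                          # tensor([0.9553, 0.2955])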

View File

@@ -1,160 +0,0 @@
#Original code can be found on: https://github.com/black-forest-labs/flux
from dataclasses import dataclass
import torch
from torch import Tensor, nn
from .layers import (
DoubleStreamBlock,
EmbedND,
LastLayer,
MLPEmbedder,
SingleStreamBlock,
timestep_embedding,
)
from einops import rearrange, repeat
import comfy.ldm.common_dit
@dataclass
class FluxParams:
in_channels: int
vec_in_dim: int
context_in_dim: int
hidden_size: int
mlp_ratio: float
num_heads: int
depth: int
depth_single_blocks: int
axes_dim: list
theta: int
qkv_bias: bool
guidance_embed: bool
class Flux(nn.Module):
"""
Transformer model for flow matching on sequences.
"""
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.dtype = dtype
params = FluxParams(**kwargs)
self.params = params
self.in_channels = params.in_channels * 2 * 2
self.out_channels = self.in_channels
if params.hidden_size % params.num_heads != 0:
raise ValueError(
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
)
pe_dim = params.hidden_size // params.num_heads
if sum(params.axes_dim) != pe_dim:
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = params.hidden_size
self.num_heads = params.num_heads
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
)
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
self.hidden_size,
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
]
)
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
for _ in range(params.depth_single_blocks)
]
)
if final_layer:
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
def forward_orig(
self,
img: Tensor,
img_ids: Tensor,
txt: Tensor,
txt_ids: Tensor,
timesteps: Tensor,
y: Tensor,
guidance: Tensor = None,
control=None,
) -> Tensor:
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
if self.params.guidance_embed:
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
vec = vec + self.vector_in(y)
txt = self.txt_in(txt)
ids = torch.cat((txt_ids, img_ids), dim=1)
pe = self.pe_embedder(ids)
for i, block in enumerate(self.double_blocks):
img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
if control is not None: # Controlnet
control_i = control.get("input")
if i < len(control_i):
add = control_i[i]
if add is not None:
img += add
img = torch.cat((txt, img), 1)
for i, block in enumerate(self.single_blocks):
img = block(img, vec=vec, pe=pe)
if control is not None: # Controlnet
control_o = control.get("output")
if i < len(control_o):
add = control_o[i]
if add is not None:
img[:, txt.shape[1] :, ...] += add
img = img[:, txt.shape[1] :, ...]
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
return img
def forward(self, x, timestep, context, y, guidance, control=None, **kwargs):
bs, c, h, w = x.shape
patch_size = 2
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
h_len = ((h + (patch_size // 2)) // patch_size)
w_len = ((w + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control)
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
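A small shape sketch for the patchify and position-id logic in forward above (sizes assumed): a 64x64 latent with patch size 2 becomes 1024 tokens of width 4*c, and h_len = w_len = 32 feeds both img_ids and the final unpatchify rearrange.
import torch
from einops import rearrange

bs, c, h, w, patch = 1, 16, 64, 64, 2
x = torch.zeros(bs, c, h, w)
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch, pw=patch)
h_len = (h + patch // 2) // patch
w_len = (w + patch // 2) // patch
print(img.shape, h_len, w_len)               # torch.Size([1, 1024, 64]) 32 32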

View File

@@ -1,218 +0,0 @@
import torch
import torch.nn as nn
from typing import Tuple, Union, Optional
from comfy.ldm.modules.attention import optimized_attention
def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
"""
Reshape frequency tensor for broadcasting it with another tensor.
This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
for the purpose of broadcasting the frequency tensor during element-wise operations.
Args:
freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
x (torch.Tensor): Target tensor for broadcasting compatibility.
head_first (bool): head dimension first (except batch dim) or not.
Returns:
torch.Tensor: Reshaped frequency tensor.
Raises:
AssertionError: If the frequency tensor doesn't match the expected shape.
AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
"""
ndim = x.ndim
assert 0 <= 1 < ndim
if isinstance(freqs_cis, tuple):
# freqs_cis: (cos, sin) in real space
if head_first:
assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
else:
# freqs_cis: values in complex space
if head_first:
assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
else:
assert freqs_cis.shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def rotate_half(x):
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
def apply_rotary_emb(
xq: torch.Tensor,
xk: Optional[torch.Tensor],
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
head_first: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Apply rotary embeddings to input tensors using the given frequency tensor.
This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
returned as real tensors.
Args:
xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Precomputed frequency tensor for complex exponentials.
head_first (bool): head dimension first (except batch dim) or not.
Returns:
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
"""
xk_out = None
if isinstance(freqs_cis, tuple):
cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
xq_out = (xq * cos + rotate_half(xq) * sin)
if xk is not None:
xk_out = (xk * cos + rotate_half(xk) * sin)
else:
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) # [B, S, H, D//2]
freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device) # [S, D//2] --> [1, S, 1, D//2]
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
if xk is not None:
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) # [B, S, H, D//2]
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
return xq_out, xk_out
class CrossAttention(nn.Module):
"""
Use QK Normalization.
"""
def __init__(self,
qdim,
kdim,
num_heads,
qkv_bias=True,
qk_norm=False,
attn_drop=0.0,
proj_drop=0.0,
attn_precision=None,
device=None,
dtype=None,
operations=None,
):
factory_kwargs = {'device': device, 'dtype': dtype}
super().__init__()
self.attn_precision = attn_precision
self.qdim = qdim
self.kdim = kdim
self.num_heads = num_heads
assert self.qdim % num_heads == 0, "self.qdim must be divisible by num_heads"
self.head_dim = self.qdim // num_heads
assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
self.scale = self.head_dim ** -0.5
self.q_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
self.kv_proj = operations.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs)
# TODO: eps should be 1 / 65530 if using fp16
self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.attn_drop = nn.Dropout(attn_drop)
self.out_proj = operations.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, y, freqs_cis_img=None):
"""
Parameters
----------
x: torch.Tensor
(batch, seqlen1, hidden_dim) (where hidden_dim = num heads * head dim)
y: torch.Tensor
(batch, seqlen2, hidden_dim2)
freqs_cis_img: torch.Tensor
(batch, hidden_dim // 2), RoPE for image
"""
b, s1, c = x.shape # [b, s1, D]
_, s2, c = y.shape # [b, s2, 1024]
q = self.q_proj(x).view(b, s1, self.num_heads, self.head_dim) # [b, s1, h, d]
kv = self.kv_proj(y).view(b, s2, 2, self.num_heads, self.head_dim) # [b, s2, 2, h, d]
k, v = kv.unbind(dim=2) # [b, s, h, d]
q = self.q_norm(q)
k = self.k_norm(k)
# Apply RoPE if needed
if freqs_cis_img is not None:
qq, _ = apply_rotary_emb(q, None, freqs_cis_img)
assert qq.shape == q.shape, f'qq: {qq.shape}, q: {q.shape}'
q = qq
q = q.transpose(-2, -3).contiguous() # q -> B, L1, H, C - B, H, L1, C
k = k.transpose(-2, -3).contiguous() # k -> B, L2, H, C - B, H, C, L2
v = v.transpose(-2, -3).contiguous()
context = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
out = self.out_proj(context) # context.reshape - B, L1, -1
out = self.proj_drop(out)
out_tuple = (out,)
return out_tuple
class Attention(nn.Module):
"""
We rename some layer names to align with flash attention
"""
def __init__(self, dim, num_heads, qkv_bias=True, qk_norm=False, attn_drop=0., proj_drop=0., attn_precision=None, dtype=None, device=None, operations=None):
super().__init__()
self.attn_precision = attn_precision
self.dim = dim
self.num_heads = num_heads
assert self.dim % num_heads == 0, 'dim should be divisible by num_heads'
self.head_dim = self.dim // num_heads
# This assertion is aligned with flash attention
assert self.head_dim % 8 == 0 and self.head_dim <= 128, "Only support head_dim <= 128 and divisible by 8"
self.scale = self.head_dim ** -0.5
# qkv --> Wqkv
self.Wqkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
# TODO: eps should be 1 / 65530 if using fp16
self.q_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.k_norm = operations.LayerNorm(self.head_dim, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device) if qk_norm else nn.Identity()
self.attn_drop = nn.Dropout(attn_drop)
self.out_proj = operations.Linear(dim, dim, dtype=dtype, device=device)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, freqs_cis_img=None):
B, N, C = x.shape
qkv = self.Wqkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) # [3, b, h, s, d]
q, k, v = qkv.unbind(0) # [b, h, s, d]
q = self.q_norm(q) # [b, h, s, d]
k = self.k_norm(k) # [b, h, s, d]
# Apply RoPE if needed
if freqs_cis_img is not None:
qq, kk = apply_rotary_emb(q, k, freqs_cis_img, head_first=True)
assert qq.shape == q.shape and kk.shape == k.shape, \
f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}'
q, k = qq, kk
x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, attn_precision=self.attn_precision)
x = self.out_proj(x)
x = self.proj_drop(x)
out_tuple = (x,)
return out_tuple
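The head_first=False branch of reshape_for_broadcast above views an (S, D) cos/sin table as (1, S, 1, D) so it broadcasts over batch and heads of a (B, S, H, D) tensor. A self-contained shape check (sizes assumed):
import torch

B, S, H, D = 2, 16, 8, 64
x = torch.zeros(B, S, H, D)
cos = torch.zeros(S, D)                      # one row of cos values per position
shape = [d if i == 1 or i == x.ndim - 1 else 1 for i, d in enumerate(x.shape)]
print(shape, cos.view(*shape).shape)         # [1, 16, 1, 64] torch.Size([1, 16, 1, 64])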

View File

@@ -1,321 +0,0 @@
from typing import Any, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import checkpoint
from comfy.ldm.modules.diffusionmodules.mmdit import (
Mlp,
TimestepEmbedder,
PatchEmbed,
RMSNorm,
)
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from .poolers import AttentionPool
import comfy.latent_formats
from .models import HunYuanDiTBlock, calc_rope
from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
class HunYuanControlNet(nn.Module):
"""
HunYuanDiT: Diffusion model with a Transformer backbone.
Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
Parameters
----------
args: argparse.Namespace
The arguments parsed by argparse.
input_size: tuple
The size of the input image.
patch_size: int
The size of the patch.
in_channels: int
The number of input channels.
hidden_size: int
The hidden size of the transformer backbone.
depth: int
The number of transformer blocks.
num_heads: int
The number of attention heads.
mlp_ratio: float
The ratio of the hidden size of the MLP in the transformer block.
log_fn: callable
The logging function.
"""
def __init__(
self,
input_size: tuple = 128,
patch_size: int = 2,
in_channels: int = 4,
hidden_size: int = 1408,
depth: int = 40,
num_heads: int = 16,
mlp_ratio: float = 4.3637,
text_states_dim=1024,
text_states_dim_t5=2048,
text_len=77,
text_len_t5=256,
qk_norm=True, # See http://arxiv.org/abs/2302.05442 for details.
size_cond=False,
use_style_cond=False,
learn_sigma=True,
norm="layer",
log_fn: callable = print,
attn_precision=None,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.log_fn = log_fn
self.depth = depth
self.learn_sigma = learn_sigma
self.in_channels = in_channels
self.out_channels = in_channels * 2 if learn_sigma else in_channels
self.patch_size = patch_size
self.num_heads = num_heads
self.hidden_size = hidden_size
self.text_states_dim = text_states_dim
self.text_states_dim_t5 = text_states_dim_t5
self.text_len = text_len
self.text_len_t5 = text_len_t5
self.size_cond = size_cond
self.use_style_cond = use_style_cond
self.norm = norm
self.dtype = dtype
self.latent_format = comfy.latent_formats.SDXL
self.mlp_t5 = nn.Sequential(
nn.Linear(
self.text_states_dim_t5,
self.text_states_dim_t5 * 4,
bias=True,
dtype=dtype,
device=device,
),
nn.SiLU(),
nn.Linear(
self.text_states_dim_t5 * 4,
self.text_states_dim,
bias=True,
dtype=dtype,
device=device,
),
)
# learnable replace
self.text_embedding_padding = nn.Parameter(
torch.randn(
self.text_len + self.text_len_t5,
self.text_states_dim,
dtype=dtype,
device=device,
)
)
# Attention pooling
pooler_out_dim = 1024
self.pooler = AttentionPool(
self.text_len_t5,
self.text_states_dim_t5,
num_heads=8,
output_dim=pooler_out_dim,
dtype=dtype,
device=device,
operations=operations,
)
# Dimension of the extra input vectors
self.extra_in_dim = pooler_out_dim
if self.size_cond:
# Image size and crop size conditions
self.extra_in_dim += 6 * 256
if self.use_style_cond:
# Here we use a default learned embedder layer for future extension.
self.style_embedder = nn.Embedding(
1, hidden_size, dtype=dtype, device=device
)
self.extra_in_dim += hidden_size
# Text embedding for `add`
self.x_embedder = PatchEmbed(
input_size,
patch_size,
in_channels,
hidden_size,
dtype=dtype,
device=device,
operations=operations,
)
self.t_embedder = TimestepEmbedder(
hidden_size, dtype=dtype, device=device, operations=operations
)
self.extra_embedder = nn.Sequential(
operations.Linear(
self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device
),
nn.SiLU(),
operations.Linear(
hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device
),
)
# Image embedding
num_patches = self.x_embedder.num_patches
# HunYuanDiT Blocks
self.blocks = nn.ModuleList(
[
HunYuanDiTBlock(
hidden_size=hidden_size,
c_emb_size=hidden_size,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
text_states_dim=self.text_states_dim,
qk_norm=qk_norm,
norm_type=self.norm,
skip=False,
attn_precision=attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
for _ in range(19)
]
)
# Input zero linear for the first block
self.before_proj = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
# Output zero linear for the every block
self.after_proj_list = nn.ModuleList(
[
operations.Linear(
self.hidden_size, self.hidden_size, dtype=dtype, device=device
)
for _ in range(len(self.blocks))
]
)
def forward(
self,
x,
hint,
timesteps,
context,#encoder_hidden_states=None,
text_embedding_mask=None,
encoder_hidden_states_t5=None,
text_embedding_mask_t5=None,
image_meta_size=None,
style=None,
return_dict=False,
**kwarg,
):
"""
Forward pass of the encoder.
Parameters
----------
x: torch.Tensor
(B, D, H, W)
t: torch.Tensor
(B)
encoder_hidden_states: torch.Tensor
CLIP text embedding, (B, L_clip, D)
text_embedding_mask: torch.Tensor
CLIP text embedding mask, (B, L_clip)
encoder_hidden_states_t5: torch.Tensor
T5 text embedding, (B, L_t5, D)
text_embedding_mask_t5: torch.Tensor
T5 text embedding mask, (B, L_t5)
image_meta_size: torch.Tensor
(B, 6)
style: torch.Tensor
(B)
cos_cis_img: torch.Tensor
sin_cis_img: torch.Tensor
return_dict: bool
Whether to return a dictionary.
"""
condition = hint
if condition.shape[0] == 1:
condition = torch.repeat_interleave(condition, x.shape[0], dim=0)
text_states = context # 2,77,1024
text_states_t5 = encoder_hidden_states_t5 # 2,256,2048
text_states_mask = text_embedding_mask.bool() # 2,77
text_states_t5_mask = text_embedding_mask_t5.bool() # 2,256
b_t5, l_t5, c_t5 = text_states_t5.shape
text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
text_states[:, -self.text_len :] = torch.where(
text_states_mask[:, -self.text_len :].unsqueeze(2),
text_states[:, -self.text_len :],
padding[: self.text_len],
)
text_states_t5[:, -self.text_len_t5 :] = torch.where(
text_states_t5_mask[:, -self.text_len_t5 :].unsqueeze(2),
text_states_t5[:, -self.text_len_t5 :],
padding[self.text_len :],
)
text_states = torch.cat([text_states, text_states_t5], dim=1) # 2,205,1024
# _, _, oh, ow = x.shape
# th, tw = oh // self.patch_size, ow // self.patch_size
# Get image RoPE embedding according to `reso`lution.
freqs_cis_img = calc_rope(
x, self.patch_size, self.hidden_size // self.num_heads
) # (cos_cis_img, sin_cis_img)
# ========================= Build time and image embedding =========================
t = self.t_embedder(timesteps, dtype=self.dtype)
x = self.x_embedder(x)
# ========================= Concatenate all extra vectors =========================
# Build text tokens with pooling
extra_vec = self.pooler(encoder_hidden_states_t5)
# Build image meta size tokens if applicable
# if image_meta_size is not None:
# image_meta_size = timestep_embedding(image_meta_size.view(-1), 256) # [B * 6, 256]
# if image_meta_size.dtype != self.dtype:
# image_meta_size = image_meta_size.half()
# image_meta_size = image_meta_size.view(-1, 6 * 256)
# extra_vec = torch.cat([extra_vec, image_meta_size], dim=1) # [B, D + 6 * 256]
# Build style tokens
if style is not None:
style_embedding = self.style_embedder(style)
extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
# Concatenate all extra vectors
c = t + self.extra_embedder(extra_vec) # [B, D]
# ========================= Deal with Condition =========================
condition = self.x_embedder(condition)
# ========================= Forward pass through HunYuanDiT blocks =========================
controls = []
x = x + self.before_proj(condition) # add condition
for layer, block in enumerate(self.blocks):
x = block(x, c, text_states, freqs_cis_img)
controls.append(self.after_proj_list[layer](x)) # zero linear for output
return {"output": controls}

View File

@@ -1,410 +0,0 @@
from typing import Any
import torch
import torch.nn as nn
import torch.nn.functional as F
import comfy.ops
from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from torch.utils import checkpoint
from .attn_layers import Attention, CrossAttention
from .poolers import AttentionPool
from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop
def calc_rope(x, patch_size, head_size):
th = (x.shape[2] + (patch_size // 2)) // patch_size
tw = (x.shape[3] + (patch_size // 2)) // patch_size
base_size = 512 // 8 // patch_size
start, stop = get_fill_resize_and_crop((th, tw), base_size)
sub_args = [start, stop, (th, tw)]
# head_size = HUNYUAN_DIT_CONFIG['DiT-g/2']['hidden_size'] // HUNYUAN_DIT_CONFIG['DiT-g/2']['num_heads']
rope = get_2d_rotary_pos_embed(head_size, *sub_args)
rope = (rope[0].to(x), rope[1].to(x))
return rope
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
class HunYuanDiTBlock(nn.Module):
"""
A HunYuanDiT block with `add` conditioning.
"""
def __init__(self,
hidden_size,
c_emb_size,
num_heads,
mlp_ratio=4.0,
text_states_dim=1024,
qk_norm=False,
norm_type="layer",
skip=False,
attn_precision=None,
dtype=None,
device=None,
operations=None,
):
super().__init__()
use_ele_affine = True
if norm_type == "layer":
norm_layer = operations.LayerNorm
elif norm_type == "rms":
norm_layer = RMSNorm
else:
raise ValueError(f"Unknown norm_type: {norm_type}")
# ========================= Self-Attention =========================
self.norm1 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
# ========================= FFN =========================
self.norm2 = norm_layer(hidden_size, elementwise_affine=use_ele_affine, eps=1e-6, dtype=dtype, device=device)
mlp_hidden_dim = int(hidden_size * mlp_ratio)
approx_gelu = lambda: nn.GELU(approximate="tanh")
self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0, dtype=dtype, device=device, operations=operations)
# ========================= Add =========================
# Simply use add like SDXL.
self.default_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(c_emb_size, hidden_size, bias=True, dtype=dtype, device=device)
)
# ========================= Cross-Attention =========================
self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=True,
qk_norm=qk_norm, attn_precision=attn_precision, dtype=dtype, device=device, operations=operations)
self.norm3 = norm_layer(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
# ========================= Skip Connection =========================
if skip:
self.skip_norm = norm_layer(2 * hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
self.skip_linear = operations.Linear(2 * hidden_size, hidden_size, dtype=dtype, device=device)
else:
self.skip_linear = None
self.gradient_checkpointing = False
def _forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
# Long Skip Connection
if self.skip_linear is not None:
cat = torch.cat([x, skip], dim=-1)
if cat.dtype != x.dtype:
cat = cat.to(x.dtype)
cat = self.skip_norm(cat)
x = self.skip_linear(cat)
# Self-Attention
shift_msa = self.default_modulation(c).unsqueeze(dim=1)
attn_inputs = (
self.norm1(x) + shift_msa, freq_cis_img,
)
x = x + self.attn1(*attn_inputs)[0]
# Cross-Attention
cross_inputs = (
self.norm3(x), text_states, freq_cis_img
)
x = x + self.attn2(*cross_inputs)[0]
# FFN Layer
mlp_inputs = self.norm2(x)
x = x + self.mlp(mlp_inputs)
return x
def forward(self, x, c=None, text_states=None, freq_cis_img=None, skip=None):
if self.gradient_checkpointing and self.training:
return checkpoint.checkpoint(self._forward, x, c, text_states, freq_cis_img, skip)
return self._forward(x, c, text_states, freq_cis_img, skip)
class FinalLayer(nn.Module):
"""
The final layer of HunYuanDiT.
"""
def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(c_emb_size, 2 * final_hidden_size, bias=True, dtype=dtype, device=device)
)
def forward(self, x, c):
shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
x = modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
return x
class HunYuanDiT(nn.Module):
"""
HunYuanDiT: Diffusion model with a Transformer backbone.
Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.
Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.
Parameters
----------
args: argparse.Namespace
The arguments parsed by argparse.
input_size: tuple
The size of the input image.
patch_size: int
The size of the patch.
in_channels: int
The number of input channels.
hidden_size: int
The hidden size of the transformer backbone.
depth: int
The number of transformer blocks.
num_heads: int
The number of attention heads.
mlp_ratio: float
The ratio of the hidden size of the MLP in the transformer block.
log_fn: callable
The logging function.
"""
#@register_to_config
def __init__(self,
input_size: tuple = 32,
patch_size: int = 2,
in_channels: int = 4,
hidden_size: int = 1152,
depth: int = 28,
num_heads: int = 16,
mlp_ratio: float = 4.0,
text_states_dim = 1024,
text_states_dim_t5 = 2048,
text_len = 77,
text_len_t5 = 256,
qk_norm = True,# See http://arxiv.org/abs/2302.05442 for details.
size_cond = False,
use_style_cond = False,
learn_sigma = True,
norm = "layer",
log_fn: callable = print,
attn_precision=None,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.log_fn = log_fn
self.depth = depth
self.learn_sigma = learn_sigma
self.in_channels = in_channels
self.out_channels = in_channels * 2 if learn_sigma else in_channels
self.patch_size = patch_size
self.num_heads = num_heads
self.hidden_size = hidden_size
self.text_states_dim = text_states_dim
self.text_states_dim_t5 = text_states_dim_t5
self.text_len = text_len
self.text_len_t5 = text_len_t5
self.size_cond = size_cond
self.use_style_cond = use_style_cond
self.norm = norm
self.dtype = dtype
#import pdb
#pdb.set_trace()
self.mlp_t5 = nn.Sequential(
operations.Linear(self.text_states_dim_t5, self.text_states_dim_t5 * 4, bias=True, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(self.text_states_dim_t5 * 4, self.text_states_dim, bias=True, dtype=dtype, device=device),
)
# learnable replace
self.text_embedding_padding = nn.Parameter(
torch.empty(self.text_len + self.text_len_t5, self.text_states_dim, dtype=dtype, device=device))
# Attention pooling
pooler_out_dim = 1024
self.pooler = AttentionPool(self.text_len_t5, self.text_states_dim_t5, num_heads=8, output_dim=pooler_out_dim, dtype=dtype, device=device, operations=operations)
# Dimension of the extra input vectors
self.extra_in_dim = pooler_out_dim
if self.size_cond:
# Image size and crop size conditions
self.extra_in_dim += 6 * 256
if self.use_style_cond:
# Here we use a default learned embedder layer for future extension.
self.style_embedder = operations.Embedding(1, hidden_size, dtype=dtype, device=device)
self.extra_in_dim += hidden_size
# Text embedding for `add`
self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, dtype=dtype, device=device, operations=operations)
self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype, device=device, operations=operations)
self.extra_embedder = nn.Sequential(
operations.Linear(self.extra_in_dim, hidden_size * 4, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device),
)
# Image embedding
num_patches = self.x_embedder.num_patches
# HunYuanDiT Blocks
self.blocks = nn.ModuleList([
HunYuanDiTBlock(hidden_size=hidden_size,
c_emb_size=hidden_size,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
text_states_dim=self.text_states_dim,
qk_norm=qk_norm,
norm_type=self.norm,
skip=layer > depth // 2,
attn_precision=attn_precision,
dtype=dtype,
device=device,
operations=operations,
)
for layer in range(depth)
])
self.final_layer = FinalLayer(hidden_size, hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
self.unpatchify_channels = self.out_channels
def forward(self,
x,
t,
context,#encoder_hidden_states=None,
text_embedding_mask=None,
encoder_hidden_states_t5=None,
text_embedding_mask_t5=None,
image_meta_size=None,
style=None,
return_dict=False,
control=None,
transformer_options=None,
):
"""
Forward pass of the encoder.
Parameters
----------
x: torch.Tensor
(B, D, H, W)
t: torch.Tensor
(B)
encoder_hidden_states: torch.Tensor
CLIP text embedding, (B, L_clip, D)
text_embedding_mask: torch.Tensor
CLIP text embedding mask, (B, L_clip)
encoder_hidden_states_t5: torch.Tensor
T5 text embedding, (B, L_t5, D)
text_embedding_mask_t5: torch.Tensor
T5 text embedding mask, (B, L_t5)
image_meta_size: torch.Tensor
(B, 6)
style: torch.Tensor
(B)
cos_cis_img: torch.Tensor
sin_cis_img: torch.Tensor
return_dict: bool
Whether to return a dictionary.
"""
#import pdb
#pdb.set_trace()
encoder_hidden_states = context
text_states = encoder_hidden_states # 2,77,1024
text_states_t5 = encoder_hidden_states_t5 # 2,256,2048
text_states_mask = text_embedding_mask.bool() # 2,77
text_states_t5_mask = text_embedding_mask_t5.bool() # 2,256
b_t5, l_t5, c_t5 = text_states_t5.shape
text_states_t5 = self.mlp_t5(text_states_t5.view(-1, c_t5)).view(b_t5, l_t5, -1)
padding = comfy.ops.cast_to_input(self.text_embedding_padding, text_states)
text_states[:,-self.text_len:] = torch.where(text_states_mask[:,-self.text_len:].unsqueeze(2), text_states[:,-self.text_len:], padding[:self.text_len])
text_states_t5[:,-self.text_len_t5:] = torch.where(text_states_t5_mask[:,-self.text_len_t5:].unsqueeze(2), text_states_t5[:,-self.text_len_t5:], padding[self.text_len:])
text_states = torch.cat([text_states, text_states_t5], dim=1) # 2,205,1024
# clip_t5_mask = torch.cat([text_states_mask, text_states_t5_mask], dim=-1)
_, _, oh, ow = x.shape
th, tw = (oh + (self.patch_size // 2)) // self.patch_size, (ow + (self.patch_size // 2)) // self.patch_size
# Get image RoPE embedding according to `reso`lution.
freqs_cis_img = calc_rope(x, self.patch_size, self.hidden_size // self.num_heads) #(cos_cis_img, sin_cis_img)
# ========================= Build time and image embedding =========================
t = self.t_embedder(t, dtype=x.dtype)
x = self.x_embedder(x)
# ========================= Concatenate all extra vectors =========================
# Build text tokens with pooling
extra_vec = self.pooler(encoder_hidden_states_t5)
# Build image meta size tokens if applicable
if self.size_cond:
image_meta_size = timestep_embedding(image_meta_size.view(-1), 256).to(x.dtype) # [B * 6, 256]
image_meta_size = image_meta_size.view(-1, 6 * 256)
extra_vec = torch.cat([extra_vec, image_meta_size], dim=1) # [B, D + 6 * 256]
# Build style tokens
if self.use_style_cond:
if style is None:
style = torch.zeros((extra_vec.shape[0],), device=x.device, dtype=torch.int)
style_embedding = self.style_embedder(style, out_dtype=x.dtype)
extra_vec = torch.cat([extra_vec, style_embedding], dim=1)
# Concatenate all extra vectors
c = t + self.extra_embedder(extra_vec) # [B, D]
controls = None
if control:
controls = control.get("output", None)
# ========================= Forward pass through HunYuanDiT blocks =========================
skips = []
for layer, block in enumerate(self.blocks):
if layer > self.depth // 2:
if controls is not None:
skip = skips.pop() + controls.pop()
else:
skip = skips.pop()
x = block(x, c, text_states, freqs_cis_img, skip) # (N, L, D)
else:
x = block(x, c, text_states, freqs_cis_img) # (N, L, D)
if layer < (self.depth // 2 - 1):
skips.append(x)
if controls is not None and len(controls) != 0:
raise ValueError("The number of controls is not equal to the number of skip connections.")
# ========================= Final layer =========================
x = self.final_layer(x, c) # (N, L, patch_size ** 2 * out_channels)
x = self.unpatchify(x, th, tw) # (N, out_channels, H, W)
if return_dict:
return {'x': x}
if self.learn_sigma:
return x[:,:self.out_channels // 2,:oh,:ow]
return x[:,:,:oh,:ow]
def unpatchify(self, x, h, w):
"""
x: (N, T, patch_size**2 * C)
imgs: (N, H, W, C)
"""
c = self.unpatchify_channels
p = self.x_embedder.patch_size[0]
# h = w = int(x.shape[1] ** 0.5)
assert h * w == x.shape[1]
x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
x = torch.einsum('nhwpqc->nchpwq', x)
imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
return imgs
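The skip-connection bookkeeping in HunYuanDiT.forward above pairs pushed activations with the ControlNet residuals popped later. With depth = 40 (the default of the HunYuanControlNet class above), 19 activations are pushed and 19 are popped, matching the 19 residuals that ControlNet returns; a quick check:
depth = 40                                        # default of the ControlNet class above
pushed = [layer for layer in range(depth) if layer < depth // 2 - 1]
popped = [layer for layer in range(depth) if layer > depth // 2]
print(len(pushed), len(popped))                   # 19 19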

View File

@@ -1,37 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.attention import optimized_attention
import comfy.ops
class AttentionPool(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None, dtype=None, device=None, operations=None):
super().__init__()
self.positional_embedding = nn.Parameter(torch.empty(spacial_dim + 1, embed_dim, dtype=dtype, device=device))
self.k_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.q_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.v_proj = operations.Linear(embed_dim, embed_dim, dtype=dtype, device=device)
self.c_proj = operations.Linear(embed_dim, output_dim or embed_dim, dtype=dtype, device=device)
self.num_heads = num_heads
self.embed_dim = embed_dim
def forward(self, x):
x = x[:,:self.positional_embedding.shape[0] - 1]
x = x.permute(1, 0, 2) # NLC -> LNC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (L+1)NC
x = x + comfy.ops.cast_to_input(self.positional_embedding[:, None, :], x) # (L+1)NC
q = self.q_proj(x[:1])
k = self.k_proj(x)
v = self.v_proj(x)
batch_size = q.shape[1]
head_dim = self.embed_dim // self.num_heads
q = q.view(1, batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
k = k.view(k.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
v = v.view(v.shape[0], batch_size * self.num_heads, head_dim).transpose(0, 1).view(batch_size, self.num_heads, -1, head_dim)
attn_output = optimized_attention(q, k, v, self.num_heads, skip_reshape=True).transpose(0, 1)
attn_output = self.c_proj(attn_output)
return attn_output.squeeze(0)
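AttentionPool above prepends the mean token and uses only that token as the query, so each batch element is pooled to a single vector of size output_dim. A shape sketch of the prepend step (sizes assumed):
import torch

N, L, C = 2, 16, 32
x = torch.randn(L, N, C)                          # LNC layout, as inside forward above
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)
print(x.shape, x[:1].shape)                       # torch.Size([17, 2, 32]) torch.Size([1, 2, 32])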

View File

@@ -1,224 +0,0 @@
import torch
import numpy as np
from typing import Union
def _to_tuple(x):
if isinstance(x, int):
return x, x
else:
return x
def get_fill_resize_and_crop(src, tgt):
th, tw = _to_tuple(tgt)
h, w = _to_tuple(src)
tr = th / tw # base aspect ratio
r = h / w # target aspect ratio
# resize
if r > tr:
resize_height = th
resize_width = int(round(th / h * w))
else:
resize_width = tw
resize_height = int(round(tw / w * h)) # resize the target resolution down based on the base resolution
crop_top = int(round((th - resize_height) / 2.0))
crop_left = int(round((tw - resize_width) / 2.0))
return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
def get_meshgrid(start, *args):
if len(args) == 0:
# start is grid_size
num = _to_tuple(start)
start = (0, 0)
stop = num
elif len(args) == 1:
# start is start, args[0] is stop, step is 1
start = _to_tuple(start)
stop = _to_tuple(args[0])
num = (stop[0] - start[0], stop[1] - start[1])
elif len(args) == 2:
# start is start, args[0] is stop, args[1] is num
start = _to_tuple(start)
stop = _to_tuple(args[0])
num = _to_tuple(args[1])
else:
raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
grid_h = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)
grid_w = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0) # [2, W, H]
return grid
#################################################################################
# Sine/Cosine Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid = get_meshgrid(start, *args) # [2, H, w]
# grid_h = np.arange(grid_size, dtype=np.float32)
# grid_w = np.arange(grid_size, dtype=np.float32)
# grid = np.meshgrid(grid_w, grid_h) # here w goes first
# grid = np.stack(grid, axis=0) # [2, W, H]
grid = grid.reshape([2, 1, *grid.shape[1:]])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token and extra_tokens > 0:
pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: positions to be encoded (any shape; flattened to (M,) internally)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.
omega = 1. / 10000**omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
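Shape-wise, the three helpers above compose as follows: a 4x4 grid with embed_dim=16 yields one 16-dim vector per cell, half built from the height axis and half from the width axis. A small check, illustration only:
pos_embed = get_2d_sincos_pos_embed(16, 4)     # 4x4 grid -> 16 positions
assert pos_embed.shape == (16, 16)             # layout: [sin|cos of h-axis, sin|cos of w-axis]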
#################################################################################
# Rotary Positional Embedding Functions #
#################################################################################
# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443
def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True):
"""
This is a 2d version of precompute_freqs_cis, which is a RoPE for image tokens with 2d structure.
Parameters
----------
embed_dim: int
embedding dimension size
start: int or tuple of int
If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1;
If len(args) == 2, start is start, args[0] is stop, args[1] is num.
use_real: bool
If True, return real part and imaginary part separately. Otherwise, return complex numbers.
Returns
-------
pos_embed: torch.Tensor
[HW, D/2]
"""
grid = get_meshgrid(start, *args) # [2, H, w]
grid = grid.reshape([2, 1, *grid.shape[1:]]) # grid of sampling positions at the target resolution
pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)
return pos_embed
def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
assert embed_dim % 4 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real) # (H*W, D/4)
emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real) # (H*W, D/4)
if use_real:
cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D/2)
sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D/2)
return cos, sin
else:
emb = torch.cat([emb_h, emb_w], dim=1) # (H*W, D/2)
return emb
def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):
"""
Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
and the position indices 'pos'. The 'theta' parameter scales the frequencies.
The returned tensor contains complex values in complex64 data type.
Args:
dim (int): Dimension of the frequency tensor.
pos (np.ndarray, int): Position indices for the frequency tensor. [S] or scalar
theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
use_real (bool, optional): If True, return real part and imaginary part separately.
Otherwise, return complex numbers.
Returns:
torch.Tensor: Precomputed frequency tensor with complex exponentials. [S, D/2]
"""
if isinstance(pos, int):
pos = np.arange(pos)
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) # [D/2]
t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]
freqs = torch.outer(t, freqs).float() # type: ignore # [S, D/2]
if use_real:
freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
return freqs_cos, freqs_sin
else:
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
return freqs_cis
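The table returned above is typically consumed by rotating query/key features pairwise. A generic sketch of that application (illustration only; the pairing here groups adjacent channels, which is one common convention and not necessarily the one used by every model in this diff):
import torch

def apply_rope_complex(x, freqs_cis):
    # x: (..., S, D) with D even; freqs_cis: (S, D/2) complex from the helper above
    x_c = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    x_rot = x_c * freqs_cis                 # rotate each adjacent channel pair by its angle
    return torch.view_as_real(x_rot).flatten(-2).type_as(x)

freqs = get_1d_rotary_pos_embed(8, 4)       # complex64 table of shape (4, 4)
q = torch.randn(2, 4, 8)                    # (batch, seq, dim)
q_rot = apply_rope_complex(q, freqs)        # same shape, positions now encoded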
def calc_sizes(rope_img, patch_size, th, tw):
if rope_img == 'extend':
# 'extend' mode: use the requested grid's own coordinates directly
sub_args = [(th, tw)]
elif rope_img.startswith('base'):
# 'baseN' mode: interpolate the requested grid's positions from a fixed base grid derived from N pixels
base_size = int(rope_img[4:]) // 8 // patch_size
start, stop = get_fill_resize_and_crop((th, tw), base_size)
sub_args = [start, stop, (th, tw)]
else:
raise ValueError(f"Unknown rope_img: {rope_img}")
return sub_args
def init_image_posemb(rope_img,
resolutions,
patch_size,
hidden_size,
num_heads,
log_fn,
rope_real=True,
):
freqs_cis_img = {}
for reso in resolutions:
th, tw = reso.height // 8 // patch_size, reso.width // 8 // patch_size
sub_args = calc_sizes(rope_img, patch_size, th, tw)
freqs_cis_img[str(reso)] = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real)
log_fn(f" Using image RoPE ({rope_img}) ({'real' if rope_real else 'complex'}): {sub_args} | ({reso}) "
f"{freqs_cis_img[str(reso)][0].shape if rope_real else freqs_cis_img[str(reso)].shape}")
return freqs_cis_img
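The grid arithmetic in the loop above, made concrete: a 1024x1024 resolution with patch_size=2 gives a 64x64 token grid (latents are 1/8 of the image before patching), so each head's RoPE table covers 64*64 = 4096 positions.
th = tw = 1024 // 8 // 2          # image -> 1/8 latent -> / patch_size tokens
assert (th, tw) == (64, 64)       # 4096 RoPE positions per head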

View File

@@ -358,7 +358,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
disabled_xformers = True
if disabled_xformers:
return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)
return attention_pytorch(q, k, v, heads, mask)
if skip_reshape:
q, k, v = map(

View File

@@ -8,8 +8,6 @@ import torch.nn as nn
from .. import attention
from einops import rearrange, repeat
from .util import timestep_embedding
import comfy.ops
import comfy.ldm.common_dit
def default(x, y):
if x is not None:
@@ -71,14 +69,12 @@ class PatchEmbed(nn.Module):
bias: bool = True,
strict_img_size: bool = True,
dynamic_img_pad: bool = True,
padding_mode='circular',
dtype=None,
device=None,
operations=None,
):
super().__init__()
self.patch_size = (patch_size, patch_size)
self.padding_mode = padding_mode
if img_size is not None:
self.img_size = (img_size, img_size)
self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
@@ -112,7 +108,9 @@ class PatchEmbed(nn.Module):
# f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
# )
if self.dynamic_img_pad:
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode=self.padding_mode)
pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # NCHW -> NLC
@@ -926,7 +924,7 @@ class MMDiT(nn.Module):
context = self.context_processor(context)
hw = x.shape[-2:]
x = self.x_embedder(x) + comfy.ops.cast_to_input(self.cropped_pos_embed(hw, device=x.device), x)
x = self.x_embedder(x) + self.cropped_pos_embed(hw, device=x.device).to(dtype=x.dtype, device=x.device)
c = self.t_embedder(t, dtype=x.dtype) # (N, D)
if y is not None and self.y_embedder is not None:
y = self.y_embedder(y) # (N, D)

View File

@@ -809,7 +809,7 @@ class UNetModel(nn.Module):
self.out = nn.Sequential(
operations.GroupNorm(32, ch, dtype=self.dtype, device=device),
nn.SiLU(),
operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device),
zero_module(operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device)),
)
if self.predict_codebook_ids:
self.id_predictor = nn.Sequential(

View File

@@ -1,26 +1,5 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import comfy.utils
import comfy.model_management
import comfy.model_base
import logging
import torch
LORA_CLIP_MAP = {
"mlp.fc1": "mlp_fc1",
@@ -239,17 +218,11 @@ def model_lora_keys_clip(model, key_map={}):
lora_key = "lora_prior_te_text_model_encoder_layers_{}_{}".format(b, LORA_CLIP_MAP[c]) #cascade lora: TODO put lora key prefix in the model config
key_map[lora_key] = k
for k in sdk:
if k.endswith(".weight"):
if k.startswith("t5xxl.transformer."):#OneTrainer SD3 lora
l_key = k[len("t5xxl.transformer."):-len(".weight")]
lora_key = "lora_te3_{}".format(l_key.replace(".", "_"))
key_map[lora_key] = k
elif k.startswith("hydit_clip.transformer.bert."): #HunyuanDiT Lora
l_key = k[len("hydit_clip.transformer.bert."):-len(".weight")]
lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
key_map[lora_key] = k
for k in sdk: #OneTrainer SD3 lora
if k.startswith("t5xxl.transformer.") and k.endswith(".weight"):
l_key = k[len("t5xxl.transformer."):-len(".weight")]
lora_key = "lora_te3_{}".format(l_key.replace(".", "_"))
key_map[lora_key] = k
k = "clip_g.transformer.text_projection.weight"
if k in sdk:
@@ -272,7 +245,6 @@ def model_lora_keys_unet(model, key_map={}):
key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
key_map["lora_unet_{}".format(key_lora)] = k
key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
key_map["{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config)
for k in diffusers_keys:
@@ -310,207 +282,4 @@ def model_lora_keys_unet(model, key_map={}):
key_lora = "transformer.{}".format(k[:-len(".weight")]) #simpletrainer and probably regular diffusers lora format
key_map[key_lora] = to
if isinstance(model, comfy.model_base.HunyuanDiT):
for k in sdk:
if k.startswith("diffusion_model.") and k.endswith(".weight"):
key_lora = k[len("diffusion_model."):-len(".weight")]
key_map["base_model.model.{}".format(key_lora)] = k #official hunyuan lora format
if isinstance(model, comfy.model_base.Flux): #Diffusers lora Flux
diffusers_keys = comfy.utils.flux_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
for k in diffusers_keys:
if k.endswith(".weight"):
to = diffusers_keys[k]
key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format
key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
return key_map
def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype):
dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
lora_diff *= alpha
weight_calc = weight + lora_diff.type(weight.dtype)
weight_norm = (
weight_calc.transpose(0, 1)
.reshape(weight_calc.shape[1], -1)
.norm(dim=1, keepdim=True)
.reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
.transpose(0, 1)
)
weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
if strength != 1.0:
weight_calc -= weight
weight += strength * (weight_calc)
else:
weight[:] = weight_calc
return weight
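In words, weight_decompose above merges the LoRA diff into the weight and then rescales every input column of the merged matrix so its norm matches dora_scale (the DoRA idea). A minimal 2D sanity check of that rescale, with illustrative tensors and strength fixed at 1:
import torch

W = torch.randn(8, 4)                  # (out_features, in_features)
delta = torch.randn(8, 4) * 0.01       # alpha-scaled LoRA diff
dora_scale = torch.rand(1, 4) + 0.5    # learned per-column magnitude

W_merged = W + delta
col_norm = W_merged.norm(dim=0, keepdim=True)        # per input-column norm
W_dora = W_merged * (dora_scale / col_norm)
# each column of the result now has exactly the magnitude stored in dora_scale
assert torch.allclose(W_dora.norm(dim=0, keepdim=True), dora_scale, atol=1e-5)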
def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32):
for p in patches:
strength = p[0]
v = p[1]
strength_model = p[2]
offset = p[3]
function = p[4]
if function is None:
function = lambda a: a
old_weight = None
if offset is not None:
old_weight = weight
weight = weight.narrow(offset[0], offset[1], offset[2])
if strength_model != 1.0:
weight *= strength_model
if isinstance(v, list):
v = (calculate_weight(v[1:], v[0].clone(), key, intermediate_dtype=intermediate_dtype), )
if len(v) == 1:
patch_type = "diff"
elif len(v) == 2:
patch_type = v[0]
v = v[1]
if patch_type == "diff":
w1 = v[0]
if strength != 0.0:
if w1.shape != weight.shape:
logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape))
else:
weight += function(strength * comfy.model_management.cast_to_device(w1, weight.device, weight.dtype))
elif patch_type == "lora": #lora/locon
mat1 = comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype)
mat2 = comfy.model_management.cast_to_device(v[1], weight.device, intermediate_dtype)
dora_scale = v[4]
if v[2] is not None:
alpha = v[2] / mat2.shape[0]
else:
alpha = 1.0
if v[3] is not None:
#locon mid weights, hopefully the math is fine because I didn't properly test it
mat3 = comfy.model_management.cast_to_device(v[3], weight.device, intermediate_dtype)
final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
try:
lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "lokr":
w1 = v[0]
w2 = v[1]
w1_a = v[3]
w1_b = v[4]
w2_a = v[5]
w2_b = v[6]
t2 = v[7]
dora_scale = v[8]
dim = None
if w1 is None:
dim = w1_b.shape[0]
w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
else:
w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
if w2 is None:
dim = w2_b.shape[0]
if t2 is None:
w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
else:
w2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
else:
w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
if len(w2.shape) == 4:
w1 = w1.unsqueeze(2).unsqueeze(2)
if v[2] is not None and dim is not None:
alpha = v[2] / dim
else:
alpha = 1.0
try:
lora_diff = torch.kron(w1, w2).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "loha":
w1a = v[0]
w1b = v[1]
if v[2] is not None:
alpha = v[2] / w1b.shape[0]
else:
alpha = 1.0
w2a = v[3]
w2b = v[4]
dora_scale = v[7]
if v[5] is not None: #cp decomposition
t1 = v[5]
t2 = v[6]
m1 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
m2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
else:
m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
try:
lora_diff = (m1 * m2).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "glora":
if v[4] is not None:
alpha = v[4] / v[0].shape[0]
else:
alpha = 1.0
dora_scale = v[5]
a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
try:
lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1), a2), a1)).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
else:
logging.warning("patch type not recognized {} {}".format(patch_type, key))
if old_weight is not None:
weight = old_weight
return weight
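For reference, the patch tuples consumed by the loop above are (strength, value, strength_model, offset, function). A minimal sketch of the simplest case, the "diff" patch, with hypothetical tensors and run in a context where this module's calculate_weight is importable:
import torch

weight = torch.zeros(4, 4)
w1 = torch.full((4, 4), 2.0)
# strength 0.5, plain diff payload, no model-strength scaling, offset or function
patched = calculate_weight([(0.5, (w1,), 1.0, None, None)], weight, "demo.weight")
assert torch.allclose(patched, torch.full((4, 4), 1.0))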

View File

@@ -1,21 +1,3 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
import logging
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
@@ -25,11 +7,8 @@ from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugme
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
import comfy.ldm.aura.mmdit
import comfy.ldm.hydit.models
import comfy.ldm.audio.dit
import comfy.ldm.audio.embedders
import comfy.ldm.flux.model
import comfy.model_management
import comfy.conds
import comfy.ops
@@ -46,7 +25,6 @@ class ModelType(Enum):
EDM = 5
FLOW = 6
V_PREDICTION_CONTINUOUS = 7
FLUX = 8
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@@ -74,9 +52,6 @@ def model_sampling(model_config, model_type):
elif model_type == ModelType.V_PREDICTION_CONTINUOUS:
c = V_PREDICTION
s = ModelSamplingContinuousV
elif model_type == ModelType.FLUX:
c = comfy.model_sampling.CONST
s = comfy.model_sampling.ModelSamplingFlux
class ModelSampling(s, c):
pass
@@ -92,18 +67,16 @@ class BaseModel(torch.nn.Module):
self.latent_format = model_config.latent_format
self.model_config = model_config
self.manual_cast_dtype = model_config.manual_cast_dtype
self.device = device
if not unet_config.get("disable_unet_model_creation", False):
if model_config.custom_operations is None:
operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype)
if self.manual_cast_dtype is not None:
operations = comfy.ops.manual_cast
else:
operations = model_config.custom_operations
operations = comfy.ops.disable_weight_init
self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
if comfy.model_management.force_channels_last():
self.diffusion_model.to(memory_format=torch.channels_last)
logging.debug("using channels last mode for diffusion model")
logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
self.model_type = model_type
self.model_sampling = model_sampling(model_config, model_type)
@@ -114,7 +87,6 @@ class BaseModel(torch.nn.Module):
self.concat_keys = ()
logging.info("model_type {}".format(model_type.name))
logging.debug("adm {}".format(self.adm_channels))
self.memory_usage_factor = model_config.memory_usage_factor
def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
sigma = t
@@ -273,11 +245,11 @@ class BaseModel(torch.nn.Module):
dtype = self.manual_cast_dtype
#TODO: this needs to be tweaked
area = input_shape[0] * math.prod(input_shape[2:])
return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
else:
#TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
area = input_shape[0] * math.prod(input_shape[2:])
return (area * 0.15 * self.memory_usage_factor) * (1024 * 1024)
return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
@@ -375,7 +347,6 @@ class SDXL(BaseModel):
flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
class SVD_img2vid(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
super().__init__(model_config, model_type, device=device)
@@ -616,6 +587,17 @@ class SD3(BaseModel):
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
return out
def memory_required(self, input_shape):
if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
dtype = self.get_dtype()
if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype
#TODO: this probably needs to be tweaked
area = input_shape[0] * input_shape[2] * input_shape[3]
return (area * comfy.model_management.dtype_size(dtype) * 0.012) * (1024 * 1024)
else:
area = input_shape[0] * input_shape[2] * input_shape[3]
return (area * 0.3) * (1024 * 1024)
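To make the estimate above concrete with illustrative numbers: a batch-1 SD3 latent of 128x128 (a 1024px image) in fp16 lands around 0.4 GiB on the flash-attention path versus roughly 4.8 GiB on the fallback path, which is why the attention backend matters so much for memory planning.
area = 1 * 128 * 128                             # batch * latent height * latent width
flash_path = area * 2 * 0.012 * (1024 * 1024)    # fp16 = 2 bytes -> ~0.38 GiB
fallback_path = area * 0.3 * (1024 * 1024)       # ~4.8 GiB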
class AuraFlow(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -666,50 +648,3 @@ class StableAudio1(BaseModel):
for l in s:
sd["{}{}".format(k, l)] = s[l]
return sd
class HunyuanDiT(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hydit.models.HunYuanDiT)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None:
out['text_embedding_mask'] = comfy.conds.CONDRegular(attention_mask)
conditioning_mt5xl = kwargs.get("conditioning_mt5xl", None)
if conditioning_mt5xl is not None:
out['encoder_hidden_states_t5'] = comfy.conds.CONDRegular(conditioning_mt5xl)
attention_mask_mt5xl = kwargs.get("attention_mask_mt5xl", None)
if attention_mask_mt5xl is not None:
out['text_embedding_mask_t5'] = comfy.conds.CONDRegular(attention_mask_mt5xl)
width = kwargs.get("width", 768)
height = kwargs.get("height", 768)
crop_w = kwargs.get("crop_w", 0)
crop_h = kwargs.get("crop_h", 0)
target_width = kwargs.get("target_width", width)
target_height = kwargs.get("target_height", height)
out['image_meta_size'] = comfy.conds.CONDRegular(torch.FloatTensor([[height, width, target_height, target_width, 0, 0]]))
return out
class Flux(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
def encode_adm(self, **kwargs):
return kwargs["pooled_output"]
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 3.5)]))
return out

View File

@@ -115,36 +115,6 @@ def detect_unet_config(state_dict, key_prefix):
unet_config["n_layers"] = double_layers + single_layers
return unet_config
if '{}mlp_t5.0.weight'.format(key_prefix) in state_dict_keys: #Hunyuan DiT
unet_config = {}
unet_config["image_model"] = "hydit"
unet_config["depth"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
unet_config["hidden_size"] = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[0]
if unet_config["hidden_size"] == 1408 and unet_config["depth"] == 40: #DiT-g/2
unet_config["mlp_ratio"] = 4.3637
if state_dict['{}extra_embedder.0.weight'.format(key_prefix)].shape[1] == 3968:
unet_config["size_cond"] = True
unet_config["use_style_cond"] = True
unet_config["image_model"] = "hydit1"
return unet_config
if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys: #Flux
dit_config = {}
dit_config["image_model"] = "flux"
dit_config["in_channels"] = 16
dit_config["vec_in_dim"] = 768
dit_config["context_in_dim"] = 4096
dit_config["hidden_size"] = 3072
dit_config["mlp_ratio"] = 4.0
dit_config["num_heads"] = 24
dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
dit_config["axes_dim"] = [16, 56, 56]
dit_config["theta"] = 10000
dit_config["qkv_bias"] = True
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
return dit_config
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
return None
@@ -472,15 +442,9 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
'transformer_depth': [0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': False,
'context_dim': 768, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 1, 1, 1, 1],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SD15_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
'dtype': dtype, 'in_channels': 9, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]
supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p]
for unet_config in supported_models:
matches = True
@@ -501,12 +465,7 @@ def model_config_from_diffusers_unet(state_dict):
def convert_diffusers_mmdit(state_dict, output_prefix=""):
out_sd = {}
if 'transformer_blocks.0.attn.norm_added_k.weight' in state_dict: #Flux
depth = count_blocks(state_dict, 'transformer_blocks.{}.')
depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.')
hidden_size = state_dict["x_embedder.bias"].shape[0]
sd_map = comfy.utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix)
elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3
if 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3
num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.')
depth = state_dict["pos_embed.proj.weight"].shape[0] // 64
sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix)
@@ -532,12 +491,7 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""):
old_weight = out_sd.get(t[0], None)
if old_weight is None:
old_weight = torch.empty_like(weight)
if old_weight.shape[offset[0]] < offset[1] + offset[2]:
exp = list(weight.shape)
exp[offset[0]] = offset[1] + offset[2]
new = torch.empty(exp, device=weight.device, dtype=weight.dtype)
new[:old_weight.shape[0]] = old_weight
old_weight = new
old_weight = old_weight.repeat([3] + [1] * (len(old_weight.shape) - 1))
w = old_weight.narrow(offset[0], offset[1], offset[2])
else:

View File

@@ -1,21 +1,3 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import psutil
import logging
from enum import Enum
@@ -44,14 +26,9 @@ cpu_state = CPUState.GPU
total_vram = 0
xpu_available = False
try:
torch_version = torch.version.__version__
xpu_available = (int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) <= 4)) and torch.xpu.is_available()
except:
pass
lowvram_available = True
xpu_available = False
if args.deterministic:
logging.info("Using deterministic algorithms for pytorch")
torch.use_deterministic_algorithms(True, warn_only=True)
@@ -71,10 +48,10 @@ if args.directml is not None:
try:
import intel_extension_for_pytorch as ipex
_ = torch.xpu.device_count()
xpu_available = torch.xpu.is_available()
if torch.xpu.is_available():
xpu_available = True
except:
xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available())
pass
try:
if torch.backends.mps.is_available():
@@ -194,6 +171,7 @@ VAE_DTYPES = [torch.float32]
try:
if is_nvidia():
torch_version = torch.version.__version__
if int(torch_version[0]) >= 2:
if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
ENABLE_PYTORCH_ATTENTION = True
@@ -295,12 +273,9 @@ class LoadedModel:
def model_memory(self):
return self.model.model_size()
def model_offloaded_memory(self):
return self.model.model_size() - self.model.loaded_size()
def model_memory_required(self, device):
if device == self.model.current_loaded_device():
return self.model_offloaded_memory()
if device == self.model.current_device:
return 0
else:
return self.model_memory()
@@ -312,76 +287,38 @@ class LoadedModel:
load_weights = not self.weights_loaded
if self.model.loaded_size() > 0:
use_more_vram = lowvram_model_memory
if use_more_vram == 0:
use_more_vram = 1e32
self.model_use_more_vram(use_more_vram)
else:
try:
self.real_model = self.model.patch_model(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, load_weights=load_weights, force_patch_weights=force_patch_weights)
except Exception as e:
self.model.unpatch_model(self.model.offload_device)
self.model_unload()
raise e
try:
if lowvram_model_memory > 0 and load_weights:
self.real_model = self.model.patch_model_lowvram(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights)
else:
self.real_model = self.model.patch_model(device_to=patch_model_to, patch_weights=load_weights)
except Exception as e:
self.model.unpatch_model(self.model.offload_device)
self.model_unload()
raise e
if is_intel_xpu() and not args.disable_ipex_optimize and self.real_model is not None:
with torch.no_grad():
self.real_model = ipex.optimize(self.real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)
if is_intel_xpu() and not args.disable_ipex_optimize:
self.real_model = ipex.optimize(self.real_model.eval(), graph_mode=True, concat_linear=True)
self.weights_loaded = True
return self.real_model
def should_reload_model(self, force_patch_weights=False):
if force_patch_weights and self.model.lowvram_patch_counter() > 0:
if force_patch_weights and self.model.lowvram_patch_counter > 0:
return True
return False
def model_unload(self, memory_to_free=None, unpatch_weights=True):
if memory_to_free is not None:
if memory_to_free < self.model.loaded_size():
freed = self.model.partially_unload(self.model.offload_device, memory_to_free)
if freed >= memory_to_free:
return False
def model_unload(self, unpatch_weights=True):
self.model.unpatch_model(self.model.offload_device, unpatch_weights=unpatch_weights)
self.model.model_patches_to(self.model.offload_device)
self.weights_loaded = self.weights_loaded and not unpatch_weights
self.real_model = None
return True
def model_use_more_vram(self, extra_memory):
return self.model.partially_load(self.device, extra_memory)
def __eq__(self, other):
return self.model is other.model
def use_more_memory(extra_memory, loaded_models, device):
for m in loaded_models:
if m.device == device:
extra_memory -= m.model_use_more_vram(extra_memory)
if extra_memory <= 0:
break
def offloaded_memory(loaded_models, device):
offloaded_mem = 0
for m in loaded_models:
if m.device == device:
offloaded_mem += m.model_offloaded_memory()
return offloaded_mem
def minimum_inference_memory():
return (1024 * 1024 * 1024) * 1.2
EXTRA_RESERVED_VRAM = 200 * 1024 * 1024
if any(platform.win32_ver()):
EXTRA_RESERVED_VRAM = 500 * 1024 * 1024 #Windows is higher because of the shared vram issue
if args.reserve_vram is not None:
EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
logging.debug("Reserving {}MB vram for other applications.".format(EXTRA_RESERVED_VRAM / (1024 * 1024)))
def extra_reserved_memory():
return EXTRA_RESERVED_VRAM
return (1024 * 1024 * 1024)
def unload_model_clones(model, unload_weights_only=True, force_unload=True):
to_unload = []
@@ -415,7 +352,6 @@ def unload_model_clones(model, unload_weights_only=True, force_unload=True):
def free_memory(memory_required, device, keep_loaded=[]):
unloaded_model = []
can_unload = []
unloaded_models = []
for i in range(len(current_loaded_models) -1, -1, -1):
shift_model = current_loaded_models[i]
@@ -426,18 +362,14 @@ def free_memory(memory_required, device, keep_loaded=[]):
for x in sorted(can_unload):
i = x[-1]
memory_to_free = None
if not DISABLE_SMART_MEMORY:
free_mem = get_free_memory(device)
if free_mem > memory_required:
if get_free_memory(device) > memory_required:
break
memory_to_free = memory_required - free_mem
logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
if current_loaded_models[i].model_unload(memory_to_free):
unloaded_model.append(i)
current_loaded_models[i].model_unload()
unloaded_model.append(i)
for i in sorted(unloaded_model, reverse=True):
unloaded_models.append(current_loaded_models.pop(i))
current_loaded_models.pop(i)
if len(unloaded_model) > 0:
soft_empty_cache()
@@ -446,17 +378,12 @@ def free_memory(memory_required, device, keep_loaded=[]):
mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
if mem_free_torch > mem_free_total * 0.25:
soft_empty_cache()
return unloaded_models
def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
def load_models_gpu(models, memory_required=0, force_patch_weights=False):
global vram_state
inference_memory = minimum_inference_memory()
extra_mem = max(inference_memory, memory_required + extra_reserved_memory())
if minimum_memory_required is None:
minimum_memory_required = extra_mem
else:
minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())
extra_mem = max(inference_memory, memory_required)
models = set(models)
@@ -489,36 +416,25 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
devs = set(map(lambda a: a.device, models_already_loaded))
for d in devs:
if d != torch.device("cpu"):
free_memory(extra_mem + offloaded_memory(models_already_loaded, d), d, models_already_loaded)
free_mem = get_free_memory(d)
if free_mem < minimum_memory_required:
logging.info("Unloading models for lowram load.") #TODO: partial model unloading when this case happens, also handle the opposite case where models can be unlowvramed.
models_to_load = free_memory(minimum_memory_required, d)
logging.info("{} models unloaded.".format(len(models_to_load)))
else:
use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
if len(models_to_load) == 0:
return
free_memory(extra_mem, d, models_already_loaded)
return
logging.info(f"Loading {len(models_to_load)} new model{'s' if len(models_to_load) > 1 else ''}")
total_memory_required = {}
for loaded_model in models_to_load:
unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) #unload clones where the weights are different
total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
if unload_model_clones(loaded_model.model, unload_weights_only=True, force_unload=False) == True:#unload clones where the weights are different
total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
for loaded_model in models_already_loaded:
total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
for device in total_memory_required:
if device != torch.device("cpu"):
free_memory(total_memory_required[device] * 1.3 + extra_mem, device, models_already_loaded)
for loaded_model in models_to_load:
weights_unloaded = unload_model_clones(loaded_model.model, unload_weights_only=False, force_unload=False) #unload the rest of the clones where the weights can stay loaded
if weights_unloaded is not None:
loaded_model.weights_loaded = not weights_unloaded
for device in total_memory_required:
if device != torch.device("cpu"):
free_memory(total_memory_required[device] * 1.1 + extra_mem, device, models_already_loaded)
for loaded_model in models_to_load:
model = loaded_model.model
torch_dev = model.load_device
@@ -527,11 +443,11 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
else:
vram_set_state = vram_state
lowvram_model_memory = 0
if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM) and not force_full_load:
if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
model_size = loaded_model.model_memory_required(torch_dev)
current_free_mem = get_free_memory(torch_dev)
lowvram_model_memory = max(64 * (1024 * 1024), (current_free_mem - minimum_memory_required), min(current_free_mem * 0.4, current_free_mem - minimum_inference_memory()))
if model_size <= lowvram_model_memory: #only switch to lowvram if really necessary
lowvram_model_memory = int(max(64 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3 ))
if model_size <= (current_free_mem - inference_memory): #only switch to lowvram if really necessary
lowvram_model_memory = 0
if vram_set_state == VRAMState.NO_VRAM:
@@ -539,14 +455,6 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
cur_loaded_model = loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights)
current_loaded_models.insert(0, loaded_model)
devs = set(map(lambda a: a.device, models_already_loaded))
for d in devs:
if d != torch.device("cpu"):
free_mem = get_free_memory(d)
if free_mem > minimum_memory_required:
use_more_memory(free_mem - minimum_memory_required, models_already_loaded, d)
return
@@ -566,9 +474,7 @@ def loaded_models(only_currently_used=False):
def cleanup_models(keep_clone_weights_loaded=False):
to_delete = []
for i in range(len(current_loaded_models)):
#TODO: very fragile function needs improvement
num_refs = sys.getrefcount(current_loaded_models[i].model)
if num_refs <= 2:
if sys.getrefcount(current_loaded_models[i].model) <= 2:
if not keep_clone_weights_loaded:
to_delete = [i] + to_delete
#TODO: find a less fragile way to do this.
@@ -617,9 +523,6 @@ def unet_inital_load_device(parameters, dtype):
else:
return cpu_dev
def maximum_vram_for_weights(device=None):
return (get_total_memory(device) * 0.88 - minimum_inference_memory())
def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
if args.bf16_unet:
return torch.bfloat16
@@ -629,37 +532,12 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
return torch.float8_e4m3fn
if args.fp8_e5m2_unet:
return torch.float8_e5m2
fp8_dtype = None
try:
for dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
if dtype in supported_dtypes:
fp8_dtype = dtype
break
except:
pass
if fp8_dtype is not None:
free_model_memory = maximum_vram_for_weights(device)
if model_params * 2 > free_model_memory:
return fp8_dtype
for dt in supported_dtypes:
if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params):
if torch.float16 in supported_dtypes:
return torch.float16
if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params):
if torch.bfloat16 in supported_dtypes:
return torch.bfloat16
for dt in supported_dtypes:
if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params, manual_cast=True):
if torch.float16 in supported_dtypes:
return torch.float16
if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params, manual_cast=True):
if torch.bfloat16 in supported_dtypes:
return torch.bfloat16
if should_use_fp16(device=device, model_params=model_params, manual_cast=True):
if torch.float16 in supported_dtypes:
return torch.float16
if should_use_bf16(device, model_params=model_params, manual_cast=True):
if torch.bfloat16 in supported_dtypes:
return torch.bfloat16
return torch.float32
# None means no manual cast
@@ -675,14 +553,13 @@ def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.flo
if bf16_supported and weight_dtype == torch.bfloat16:
return None
fp16_supported = should_use_fp16(inference_device, prioritize_performance=True)
for dt in supported_dtypes:
if dt == torch.float16 and fp16_supported:
return torch.float16
if dt == torch.bfloat16 and bf16_supported:
return torch.bfloat16
if fp16_supported and torch.float16 in supported_dtypes:
return torch.float16
return torch.float32
elif bf16_supported and torch.bfloat16 in supported_dtypes:
return torch.bfloat16
else:
return torch.float32
def text_encoder_offload_device():
if args.gpu_only:
@@ -701,20 +578,6 @@ def text_encoder_device():
else:
return torch.device("cpu")
def text_encoder_initial_device(load_device, offload_device, model_size=0):
if load_device == offload_device or model_size <= 1024 * 1024 * 1024:
return offload_device
if is_device_mps(load_device):
return offload_device
mem_l = get_free_memory(load_device)
mem_o = get_free_memory(offload_device)
if mem_l > (mem_o * 0.5) and model_size * 1.2 < mem_l:
return load_device
else:
return offload_device
def text_encoder_dtype(device=None):
if args.fp8_e4m3fn_text_enc:
return torch.float8_e4m3fn
@@ -786,29 +649,18 @@ def supports_cast(device, dtype): #TODO
return True
if dtype == torch.float16:
return True
if is_device_mps(device):
return False
if directml_enabled: #TODO: test this
return False
if dtype == torch.bfloat16:
return True
if is_device_mps(device):
return False
if dtype == torch.float8_e4m3fn:
return True
if dtype == torch.float8_e5m2:
return True
return False
def pick_weight_dtype(dtype, fallback_dtype, device=None):
if dtype is None:
dtype = fallback_dtype
elif dtype_size(dtype) > dtype_size(fallback_dtype):
dtype = fallback_dtype
if not supports_cast(device, dtype):
dtype = fallback_dtype
return dtype
def device_supports_non_blocking(device):
if is_device_mps(device):
return False #pytorch bug? mps doesn't support non blocking
@@ -891,8 +743,7 @@ def pytorch_attention_flash_attention():
def force_upcast_attention_dtype():
upcast = args.force_upcast_attention
try:
macos_version = tuple(int(n) for n in platform.mac_ver()[0].split("."))
if (14, 5) <= macos_version < (14, 7): # black image bug on recent versions of MacOS
if platform.mac_ver()[0] in ['14.5']: #black image bug on OSX Sonoma 14.5
upcast = True
except:
pass
@@ -988,21 +839,24 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
if torch.version.hip:
return True
props = torch.cuda.get_device_properties(device)
props = torch.cuda.get_device_properties("cuda")
if props.major >= 8:
return True
if props.major < 6:
return False
#FP16 is confirmed working on a 1080 (GP104) and on latest pytorch actually seems faster than fp32
fp16_works = False
#FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
#when the model doesn't actually fit on the card
#TODO: actually test if GP106 and others have the same type of behavior
nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
for x in nvidia_10_series:
if x in props.name.lower():
return True
fp16_works = True
if manual_cast:
free_model_memory = maximum_vram_for_weights(device)
if fp16_works or manual_cast:
free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
if (not prioritize_performance) or model_params * 4 > free_model_memory:
return True
@@ -1022,9 +876,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
return False
if device is not None:
if device is not None: #TODO not sure about mps bf16 support
if is_device_mps(device):
return True
return False
if FORCE_FP32:
return False
@@ -1032,15 +886,15 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
if directml_enabled:
return False
if mps_mode():
return True
if cpu_mode():
if cpu_mode() or mps_mode():
return False
if is_intel_xpu():
return True
if device is None:
device = torch.device("cuda")
props = torch.cuda.get_device_properties(device)
if props.major >= 8:
return True
@@ -1048,22 +902,12 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
bf16_works = torch.cuda.is_bf16_supported()
if bf16_works or manual_cast:
free_model_memory = maximum_vram_for_weights(device)
free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
if (not prioritize_performance) or model_params * 4 > free_model_memory:
return True
return False
def supports_fp8_compute(device=None):
props = torch.cuda.get_device_properties(device)
if props.major >= 9:
return True
if props.major < 8:
return False
if props.minor < 9:
return False
return True
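The capability gate above accepts compute capability 8.9 and newer. A spelled-out mirror of the same branch order, checked against a few well-known GPUs (the capability numbers are public NVIDIA specs; the helper name is ours):
def _fp8_capable(major, minor):
    # mirrors the branch order in supports_fp8_compute above
    if major >= 9:
        return True
    if major < 8 or minor < 9:
        return False
    return True

assert _fp8_capable(8, 6) is False   # Ampere (e.g. RTX 3090)
assert _fp8_capable(8, 9) is True    # Ada (e.g. RTX 4090)
assert _fp8_capable(9, 0) is True    # Hopper (e.g. H100)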
def soft_empty_cache(force=False):
global cpu_state
if cpu_state == CPUState.MPS:

View File

@@ -1,36 +1,35 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
import copy
import inspect
import logging
import uuid
import collections
import math
import comfy.utils
import comfy.float
import comfy.model_management
import comfy.lora
from comfy.types import UnetWrapperFunction
def weight_decompose(dora_scale, weight, lora_diff, alpha, strength):
dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, torch.float32)
lora_diff *= alpha
weight_calc = weight + lora_diff.type(weight.dtype)
weight_norm = (
weight_calc.transpose(0, 1)
.reshape(weight_calc.shape[1], -1)
.norm(dim=1, keepdim=True)
.reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
.transpose(0, 1)
)
weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
if strength != 1.0:
weight_calc -= weight
weight += strength * (weight_calc)
else:
weight[:] = weight_calc
return weight
def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
to = model_options["transformer_options"].copy()
@@ -64,30 +63,10 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
model_options["disable_cfg1_optimization"] = True
return model_options
def wipe_lowvram_weight(m):
if hasattr(m, "prev_comfy_cast_weights"):
m.comfy_cast_weights = m.prev_comfy_cast_weights
del m.prev_comfy_cast_weights
m.weight_function = None
m.bias_function = None
class LowVramPatch:
def __init__(self, key, patches):
self.key = key
self.patches = patches
def __call__(self, weight):
return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)
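The class above is what lets lowvram mode avoid baking patches into stored weights: the patch is recomputed from the original weight each time the layer casts it for a forward pass. A tiny usage sketch with a hypothetical key and a plain diff patch (illustration only; assumes comfy.lora is importable):
import torch

patches = {"layer.weight": [(1.0, (torch.full((4, 4), 0.1),), 1.0, None, None)]}
patch_fn = LowVramPatch("layer.weight", patches)
w = torch.zeros(4, 4)
w_patched = patch_fn(w)    # same result as comfy.lora.calculate_weight on this key
assert torch.allclose(w_patched, torch.full((4, 4), 0.1))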
class ModelPatcher:
def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
def __init__(self, model, load_device, offload_device, size=0, current_device=None, weight_inplace_update=False):
self.size = size
self.model = model
if not hasattr(self.model, 'device'):
logging.debug("Model doesn't have a device attribute.")
self.model.device = offload_device
elif self.model.device is None:
self.model.device = offload_device
self.patches = {}
self.backup = {}
self.object_patches = {}
@@ -96,32 +75,24 @@ class ModelPatcher:
self.model_size()
self.load_device = load_device
self.offload_device = offload_device
if current_device is None:
self.current_device = self.offload_device
else:
self.current_device = current_device
self.weight_inplace_update = weight_inplace_update
self.model_lowvram = False
self.lowvram_patch_counter = 0
self.patches_uuid = uuid.uuid4()
if not hasattr(self.model, 'model_loaded_weight_memory'):
self.model.model_loaded_weight_memory = 0
if not hasattr(self.model, 'lowvram_patch_counter'):
self.model.lowvram_patch_counter = 0
if not hasattr(self.model, 'model_lowvram'):
self.model.model_lowvram = False
def model_size(self):
if self.size > 0:
return self.size
self.size = comfy.model_management.module_size(self.model)
return self.size
def loaded_size(self):
return self.model.model_loaded_weight_memory
def lowvram_patch_counter(self):
return self.model.lowvram_patch_counter
def clone(self):
n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, weight_inplace_update=self.weight_inplace_update)
n.patches = {}
for k in self.patches:
n.patches[k] = self.patches[k][:]
@@ -293,52 +264,67 @@ class ModelPatcher:
sd.pop(k)
return sd
def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
def patch_weight_to_device(self, key, device_to=None):
if key not in self.patches:
return
weight = comfy.utils.get_attr(self.model, key)
inplace_update = self.weight_inplace_update or inplace_update
inplace_update = self.weight_inplace_update
if key not in self.backup:
self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
self.backup[key] = weight.to(device=self.offload_device, copy=inplace_update)
if device_to is not None:
temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
else:
temp_weight = weight.to(torch.float32, copy=True)
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype)
out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype)
if inplace_update:
comfy.utils.copy_to_param(self.model, key, out_weight)
else:
comfy.utils.set_attr_param(self.model, key, out_weight)
def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
def patch_model(self, device_to=None, patch_weights=True):
for k in self.object_patches:
old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
if k not in self.object_patches_backup:
self.object_patches_backup[k] = old
if patch_weights:
model_sd = self.model_state_dict()
for key in self.patches:
if key not in model_sd:
logging.warning("could not patch. key doesn't exist in model: {}".format(key))
continue
self.patch_weight_to_device(key, device_to)
if device_to is not None:
self.model.to(device_to)
self.current_device = device_to
return self.model
def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
self.patch_model(device_to, patch_weights=False)
logging.info("loading in lowvram mode {}".format(lowvram_model_memory/(1024 * 1024)))
class LowVramPatch:
def __init__(self, key, model_patcher):
self.key = key
self.model_patcher = model_patcher
def __call__(self, weight):
return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)
mem_counter = 0
patch_counter = 0
lowvram_counter = 0
loading = []
for n, m in self.model.named_modules():
if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight"):
loading.append((comfy.model_management.module_size(m), n, m))
load_completely = []
loading.sort(reverse=True)
for x in loading:
n = x[1]
m = x[2]
module_mem = x[0]
lowvram_weight = False
if not full_load and hasattr(m, "comfy_cast_weights"):
if hasattr(m, "comfy_cast_weights"):
module_mem = comfy.model_management.module_size(m)
if mem_counter + module_mem >= lowvram_model_memory:
lowvram_weight = True
lowvram_counter += 1
if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
continue
weight_key = "{}.weight".format(n)
bias_key = "{}.bias".format(n)
@@ -348,173 +334,227 @@ class ModelPatcher:
if force_patch_weights:
self.patch_weight_to_device(weight_key)
else:
m.weight_function = LowVramPatch(weight_key, self.patches)
m.weight_function = LowVramPatch(weight_key, self)
patch_counter += 1
if bias_key in self.patches:
if force_patch_weights:
self.patch_weight_to_device(bias_key)
else:
m.bias_function = LowVramPatch(bias_key, self.patches)
m.bias_function = LowVramPatch(bias_key, self)
patch_counter += 1
m.prev_comfy_cast_weights = m.comfy_cast_weights
m.comfy_cast_weights = True
else:
if hasattr(m, "comfy_cast_weights"):
if m.comfy_cast_weights:
wipe_lowvram_weight(m)
if hasattr(m, "weight"):
mem_counter += module_mem
load_completely.append((module_mem, n, m))
self.patch_weight_to_device(weight_key, device_to)
self.patch_weight_to_device(bias_key, device_to)
m.to(device_to)
mem_counter += comfy.model_management.module_size(m)
logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
load_completely.sort(reverse=True)
for x in load_completely:
n = x[1]
m = x[2]
weight_key = "{}.weight".format(n)
bias_key = "{}.bias".format(n)
if hasattr(m, "comfy_patched_weights"):
if m.comfy_patched_weights == True:
continue
self.patch_weight_to_device(weight_key, device_to=device_to)
self.patch_weight_to_device(bias_key, device_to=device_to)
logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
m.comfy_patched_weights = True
for x in load_completely:
x[2].to(device_to)
if lowvram_counter > 0:
logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
self.model.model_lowvram = True
else:
logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
self.model.model_lowvram = False
if full_load:
self.model.to(device_to)
mem_counter = self.model_size()
self.model.lowvram_patch_counter += patch_counter
self.model.device = device_to
self.model.model_loaded_weight_memory = mem_counter
def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
for k in self.object_patches:
old = comfy.utils.set_attr(self.model, k, self.object_patches[k])
if k not in self.object_patches_backup:
self.object_patches_backup[k] = old
if lowvram_model_memory == 0:
full_load = True
else:
full_load = False
if load_weights:
self.load(device_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights, full_load=full_load)
self.model_lowvram = True
self.lowvram_patch_counter = patch_counter
return self.model
def calculate_weight(self, patches, weight, key):
for p in patches:
strength = p[0]
v = p[1]
strength_model = p[2]
offset = p[3]
function = p[4]
if function is None:
function = lambda a: a
old_weight = None
if offset is not None:
old_weight = weight
weight = weight.narrow(offset[0], offset[1], offset[2])
if strength_model != 1.0:
weight *= strength_model
if isinstance(v, list):
v = (self.calculate_weight(v[1:], v[0].clone(), key), )
if len(v) == 1:
patch_type = "diff"
elif len(v) == 2:
patch_type = v[0]
v = v[1]
if patch_type == "diff":
w1 = v[0]
if strength != 0.0:
if w1.shape != weight.shape:
logging.warning("WARNING SHAPE MISMATCH {} WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape))
else:
weight += function(strength * comfy.model_management.cast_to_device(w1, weight.device, weight.dtype))
elif patch_type == "lora": #lora/locon
mat1 = comfy.model_management.cast_to_device(v[0], weight.device, torch.float32)
mat2 = comfy.model_management.cast_to_device(v[1], weight.device, torch.float32)
dora_scale = v[4]
if v[2] is not None:
alpha = v[2] / mat2.shape[0]
else:
alpha = 1.0
if v[3] is not None:
#locon mid weights, hopefully the math is fine because I didn't properly test it
mat3 = comfy.model_management.cast_to_device(v[3], weight.device, torch.float32)
final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
try:
lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "lokr":
w1 = v[0]
w2 = v[1]
w1_a = v[3]
w1_b = v[4]
w2_a = v[5]
w2_b = v[6]
t2 = v[7]
dora_scale = v[8]
dim = None
if w1 is None:
dim = w1_b.shape[0]
w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, torch.float32),
comfy.model_management.cast_to_device(w1_b, weight.device, torch.float32))
else:
w1 = comfy.model_management.cast_to_device(w1, weight.device, torch.float32)
if w2 is None:
dim = w2_b.shape[0]
if t2 is None:
w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32))
else:
w2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2_b, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2_a, weight.device, torch.float32))
else:
w2 = comfy.model_management.cast_to_device(w2, weight.device, torch.float32)
if len(w2.shape) == 4:
w1 = w1.unsqueeze(2).unsqueeze(2)
if v[2] is not None and dim is not None:
alpha = v[2] / dim
else:
alpha = 1.0
try:
lora_diff = torch.kron(w1, w2).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "loha":
w1a = v[0]
w1b = v[1]
if v[2] is not None:
alpha = v[2] / w1b.shape[0]
else:
alpha = 1.0
w2a = v[3]
w2b = v[4]
dora_scale = v[7]
if v[5] is not None: #cp decomposition
t1 = v[5]
t2 = v[6]
m1 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t1, weight.device, torch.float32),
comfy.model_management.cast_to_device(w1b, weight.device, torch.float32),
comfy.model_management.cast_to_device(w1a, weight.device, torch.float32))
m2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2b, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2a, weight.device, torch.float32))
else:
m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, torch.float32),
comfy.model_management.cast_to_device(w1b, weight.device, torch.float32))
m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, torch.float32),
comfy.model_management.cast_to_device(w2b, weight.device, torch.float32))
try:
lora_diff = (m1 * m2).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "glora":
if v[4] is not None:
alpha = v[4] / v[0].shape[0]
else:
alpha = 1.0
dora_scale = v[5]
a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, torch.float32)
a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, torch.float32)
b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, torch.float32)
b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, torch.float32)
try:
lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1), a2), a1)).reshape(weight.shape)
if dora_scale is not None:
weight = function(weight_decompose(dora_scale, weight, lora_diff, alpha, strength))
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
else:
logging.warning("patch type not recognized {} {}".format(patch_type, key))
if old_weight is not None:
weight = old_weight
return weight
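For reference, the "lora" branch above reduces to a low-rank update: lora_diff = mat1 @ mat2, scaled by strength * alpha and added to the weight. A standalone sketch with toy shapes (the locon mid-weight and DoRA branches are omitted):

import torch

out_features, in_features, rank = 8, 16, 4
weight = torch.zeros(out_features, in_features)

mat1 = torch.randn(out_features, rank)   # "lora_up" factor
mat2 = torch.randn(rank, in_features)    # "lora_down" factor
strength = 1.0
alpha = 4.0 / mat2.shape[0]              # same scaling as the diff: v[2] / mat2.shape[0]

lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
weight += ((strength * alpha) * lora_diff).type(weight.dtype)
print(weight.shape)  # torch.Size([8, 16])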
def unpatch_model(self, device_to=None, unpatch_weights=True):
if unpatch_weights:
if self.model.model_lowvram:
if self.model_lowvram:
for m in self.model.modules():
wipe_lowvram_weight(m)
if hasattr(m, "prev_comfy_cast_weights"):
m.comfy_cast_weights = m.prev_comfy_cast_weights
del m.prev_comfy_cast_weights
m.weight_function = None
m.bias_function = None
self.model.model_lowvram = False
self.model.lowvram_patch_counter = 0
self.model_lowvram = False
self.lowvram_patch_counter = 0
keys = list(self.backup.keys())
for k in keys:
bk = self.backup[k]
if bk.inplace_update:
comfy.utils.copy_to_param(self.model, k, bk.weight)
else:
comfy.utils.set_attr_param(self.model, k, bk.weight)
if self.weight_inplace_update:
for k in keys:
comfy.utils.copy_to_param(self.model, k, self.backup[k])
else:
for k in keys:
comfy.utils.set_attr_param(self.model, k, self.backup[k])
self.backup.clear()
if device_to is not None:
self.model.to(device_to)
self.model.device = device_to
self.model.model_loaded_weight_memory = 0
for m in self.model.modules():
if hasattr(m, "comfy_patched_weights"):
del m.comfy_patched_weights
self.current_device = device_to
keys = list(self.object_patches_backup.keys())
for k in keys:
comfy.utils.set_attr(self.model, k, self.object_patches_backup[k])
self.object_patches_backup.clear()
def partially_unload(self, device_to, memory_to_free=0):
memory_freed = 0
patch_counter = 0
unload_list = []
for n, m in self.model.named_modules():
shift_lowvram = False
if hasattr(m, "comfy_cast_weights"):
module_mem = comfy.model_management.module_size(m)
unload_list.append((module_mem, n, m))
unload_list.sort()
for unload in unload_list:
if memory_to_free < memory_freed:
break
module_mem = unload[0]
n = unload[1]
m = unload[2]
weight_key = "{}.weight".format(n)
bias_key = "{}.bias".format(n)
if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
for key in [weight_key, bias_key]:
bk = self.backup.get(key, None)
if bk is not None:
if bk.inplace_update:
comfy.utils.copy_to_param(self.model, key, bk.weight)
else:
comfy.utils.set_attr_param(self.model, key, bk.weight)
self.backup.pop(key)
m.to(device_to)
if weight_key in self.patches:
m.weight_function = LowVramPatch(weight_key, self.patches)
patch_counter += 1
if bias_key in self.patches:
m.bias_function = LowVramPatch(bias_key, self.patches)
patch_counter += 1
m.prev_comfy_cast_weights = m.comfy_cast_weights
m.comfy_cast_weights = True
m.comfy_patched_weights = False
memory_freed += module_mem
logging.debug("freed {}".format(n))
self.model.model_lowvram = True
self.model.lowvram_patch_counter += patch_counter
self.model.model_loaded_weight_memory -= memory_freed
return memory_freed
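partially_unload above sorts candidate modules by size and offloads the smallest ones first until at least memory_to_free bytes have been released. A toy version of that accounting loop, using (size, name) tuples instead of real modules:

def free_until(modules, memory_to_free):
    # modules: list of (size_in_bytes, name); returns the names freed, smallest first
    freed, memory_freed = [], 0
    for size, name in sorted(modules):
        if memory_to_free < memory_freed:   # same exit condition as the diff
            break
        freed.append(name)
        memory_freed += size
    return freed, memory_freed

mods = [(300, "attn1"), (100, "norm1"), (800, "ff"), (50, "proj")]
print(free_until(mods, 400))   # (['proj', 'norm1', 'attn1'], 450)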
def partially_load(self, device_to, extra_memory=0):
self.unpatch_model(unpatch_weights=False)
self.patch_model(load_weights=False)
full_load = False
if self.model.model_lowvram == False:
return 0
if self.model.model_loaded_weight_memory + extra_memory > self.model_size():
full_load = True
current_used = self.model.model_loaded_weight_memory
self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
return self.model.model_loaded_weight_memory - current_used
def current_loaded_device(self):
return self.model.device
def calculate_weight(self, patches, weight, key, intermediate_dtype=torch.float32):
print("WARNING the ModelPatcher.calculate_weight function is deprecated, please use: comfy.lora.calculate_weight instead")
return comfy.lora.calculate_weight(patches, weight, key, intermediate_dtype=intermediate_dtype)

View File

@@ -272,43 +272,3 @@ class StableCascadeSampling(ModelSamplingDiscrete):
percent = 1.0 - percent
return self.sigma(torch.tensor(percent))
def flux_time_shift(mu: float, sigma: float, t):
return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
class ModelSamplingFlux(torch.nn.Module):
def __init__(self, model_config=None):
super().__init__()
if model_config is not None:
sampling_settings = model_config.sampling_settings
else:
sampling_settings = {}
self.set_parameters(shift=sampling_settings.get("shift", 1.15))
def set_parameters(self, shift=1.15, timesteps=10000):
self.shift = shift
ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps))
self.register_buffer('sigmas', ts)
@property
def sigma_min(self):
return self.sigmas[0]
@property
def sigma_max(self):
return self.sigmas[-1]
def timestep(self, sigma):
return sigma
def sigma(self, timestep):
return flux_time_shift(self.shift, 1.0, timestep)
def percent_to_sigma(self, percent):
if percent <= 0.0:
return 1.0
if percent >= 1.0:
return 0.0
return 1.0 - percent
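flux_time_shift above maps a timestep t in (0, 1] to a sigma via exp(mu) / (exp(mu) + (1/t - 1)**sigma); ModelSamplingFlux calls it with mu = shift (1.15 by default) and sigma = 1.0 for 10000 timesteps. A quick standalone check of the curve:

import math

def flux_time_shift(mu: float, sigma: float, t: float) -> float:
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

shift = 1.15  # default from sampling_settings above
for t in (0.25, 0.5, 0.75, 1.0):
    print(t, round(flux_time_shift(shift, 1.0, t), 4))
# exp(1.15) ~ 3.16, so sigma(t) > t for every t < 1 and sigma(1.0) == 1.0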

View File

@@ -18,42 +18,16 @@
import torch
import comfy.model_management
from comfy.cli_args import args
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
if device is None or weight.device == device:
if not copy:
if dtype is None or weight.dtype == dtype:
return weight
return weight.to(dtype=dtype, copy=copy)
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight, non_blocking=non_blocking)
return r
def cast_to_input(weight, input, non_blocking=False, copy=True):
return cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
if input is not None:
if dtype is None:
dtype = input.dtype
if bias_dtype is None:
bias_dtype = dtype
if device is None:
device = input.device
def cast_bias_weight(s, input):
bias = None
non_blocking = comfy.model_management.device_supports_non_blocking(device)
non_blocking = comfy.model_management.device_should_use_non_blocking(input.device)
if s.bias is not None:
has_function = s.bias_function is not None
bias = cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function)
if has_function:
bias = s.bias.to(device=input.device, dtype=input.dtype, non_blocking=non_blocking)
if s.bias_function is not None:
bias = s.bias_function(bias)
has_function = s.weight_function is not None
weight = cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function)
if has_function:
weight = s.weight.to(device=input.device, dtype=input.dtype, non_blocking=non_blocking)
if s.weight_function is not None:
weight = s.weight_function(weight)
return weight, bias
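The new cast_to above has a fast path that returns the tensor untouched when no device move, dtype change, or copy is needed, which is what lets cast_bias_weight skip work for already-resident weights. The function is repeated here as-is so it can be exercised on its own:

import torch

def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
    if device is None or weight.device == device:
        if not copy:
            if dtype is None or weight.dtype == dtype:
                return weight              # nothing to do: hand back the same tensor
        return weight.to(dtype=dtype, copy=copy)
    r = torch.empty_like(weight, dtype=dtype, device=device)
    r.copy_(weight, non_blocking=non_blocking)
    return r

w = torch.randn(4, 4, dtype=torch.float32)
print(cast_to(w) is w)                         # True: no-op fast path
print(cast_to(w, dtype=torch.float16).dtype)   # torch.float16: converted copy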
@@ -194,26 +168,6 @@ class disable_weight_init:
else:
return super().forward(*args, **kwargs)
class Embedding(torch.nn.Embedding, CastWeightBiasOp):
def reset_parameters(self):
self.bias = None
return None
def forward_comfy_cast_weights(self, input, out_dtype=None):
output_dtype = out_dtype
if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
out_dtype = None
weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
def forward(self, *args, **kwargs):
if self.comfy_cast_weights:
return self.forward_comfy_cast_weights(*args, **kwargs)
else:
if "out_dtype" in kwargs:
kwargs.pop("out_dtype")
return super().forward(*args, **kwargs)
@classmethod
def conv_nd(s, dims, *args, **kwargs):
if dims == 2:
@@ -248,62 +202,3 @@ class manual_cast(disable_weight_init):
class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
comfy_cast_weights = True
class Embedding(disable_weight_init.Embedding):
comfy_cast_weights = True
def fp8_linear(self, input):
dtype = self.weight.dtype
if dtype not in [torch.float8_e4m3fn]:
return None
if len(input.shape) == 3:
inn = input.reshape(-1, input.shape[2]).to(dtype)
non_blocking = comfy.model_management.device_supports_non_blocking(input.device)
w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input.dtype)
w = w.t()
scale_weight = self.scale_weight
scale_input = self.scale_input
if scale_weight is None:
scale_weight = torch.ones((1), device=input.device, dtype=torch.float32)
if scale_input is None:
scale_input = scale_weight
if scale_input is None:
scale_input = torch.ones((1), device=input.device, dtype=torch.float32)
if bias is not None:
o = torch._scaled_mm(inn, w, out_dtype=input.dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
else:
o = torch._scaled_mm(inn, w, out_dtype=input.dtype, scale_a=scale_input, scale_b=scale_weight)
if isinstance(o, tuple):
o = o[0]
return o.reshape((-1, input.shape[1], self.weight.shape[0]))
return None
class fp8_ops(manual_cast):
class Linear(manual_cast.Linear):
def reset_parameters(self):
self.scale_weight = None
self.scale_input = None
return None
def forward_comfy_cast_weights(self, input):
out = fp8_linear(self, input)
if out is not None:
return out
weight, bias = cast_bias_weight(self, input)
return torch.nn.functional.linear(input, weight, bias)
def pick_operations(weight_dtype, compute_dtype, load_device=None):
if compute_dtype is None or weight_dtype == compute_dtype:
return disable_weight_init
if args.fast:
if comfy.model_management.supports_fp8_compute(load_device):
return fp8_ops
return manual_cast
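pick_operations above decides which operations class a model gets: no-op casting when the weights already match the compute dtype, the fp8 path only when --fast is set and the device supports fp8 compute, and per-call manual casting otherwise. A simplified mirror of that decision, with string labels instead of the real classes and the flags passed in explicitly:

import torch

def pick_operations_sketch(weight_dtype, compute_dtype, fast=False, fp8_compute=False):
    if compute_dtype is None or weight_dtype == compute_dtype:
        return "disable_weight_init"   # weights already usable as-is
    if fast and fp8_compute:
        return "fp8_ops"               # torch._scaled_mm path from the hunk above
    return "manual_cast"               # cast weight/bias on every call

print(pick_operations_sketch(torch.float16, torch.float16))                      # disable_weight_init
print(pick_operations_sketch(torch.float8_e4m3fn, torch.bfloat16, True, True))   # fp8_ops
print(pick_operations_sketch(torch.float8_e4m3fn, torch.bfloat16))               # manual_cast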

View File

@@ -61,9 +61,7 @@ def prepare_sampling(model, noise_shape, conds):
device = model.load_device
real_model = None
models, inference_memory = get_additional_models(conds, model.model_dtype())
memory_required = model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory
minimum_memory_required = model.memory_required([noise_shape[0]] + list(noise_shape[1:])) + inference_memory
comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required)
comfy.model_management.load_models_gpu([model] + models, model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory)
real_model = model.model
return real_model, conds, models

View File

@@ -171,7 +171,7 @@ def calc_cond_batch(model, conds, x_in, timestep, model_options):
for i in range(1, len(to_batch_temp) + 1):
batch_amount = to_batch_temp[:len(to_batch_temp)//i]
input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
if model.memory_required(input_shape) * 1.5 < free_memory:
if model.memory_required(input_shape) < free_memory:
to_batch = batch_amount
break
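The changed line above makes batching more conservative: a candidate batch is only accepted if its estimated memory times a 1.5 safety factor still fits in free memory. A toy version of the surrounding loop, with a made-up linear cost function standing in for model.memory_required:

def pick_batch(to_batch_temp, first_shape, memory_required, free_memory):
    # try the whole candidate list, then half of it, a third, ... until it fits
    to_batch = to_batch_temp[:1]
    for i in range(1, len(to_batch_temp) + 1):
        batch_amount = to_batch_temp[:len(to_batch_temp) // i]
        input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
        if memory_required(input_shape) * 1.5 < free_memory:
            to_batch = batch_amount
            break
    return to_batch

cost = lambda shape: 4 * shape[0] * shape[1] * shape[2] * shape[3]   # hypothetical bytes
conds = list(range(8))
print(len(pick_batch(conds, [1, 4, 64, 64], cost, free_memory=300_000)))   # 2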

View File

@@ -17,14 +17,11 @@ from . import diffusers_convert
from . import model_detection
from . import sd1_clip
from . import sd2_clip
from . import sdxl_clip
import comfy.text_encoders.sd2_clip
import comfy.text_encoders.sd3_clip
import comfy.text_encoders.sa_t5
import comfy.text_encoders.aura_t5
import comfy.text_encoders.hydit
import comfy.text_encoders.flux
import comfy.text_encoders.long_clipl
import comfy.model_patcher
import comfy.lora
@@ -63,7 +60,7 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
class CLIP:
def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}):
if no_init:
return
params = target.params.copy()
@@ -72,29 +69,20 @@ class CLIP:
load_device = model_management.text_encoder_device()
offload_device = model_management.text_encoder_offload_device()
dtype = model_options.get("dtype", None)
if dtype is None:
dtype = model_management.text_encoder_dtype(load_device)
params['device'] = offload_device
dtype = model_management.text_encoder_dtype(load_device)
params['dtype'] = dtype
params['device'] = model_management.text_encoder_initial_device(load_device, offload_device, parameters * model_management.dtype_size(dtype))
params['model_options'] = model_options
self.cond_stage_model = clip(**(params))
for dt in self.cond_stage_model.dtypes:
if not model_management.supports_cast(load_device, dt):
load_device = offload_device
if params['device'] != offload_device:
self.cond_stage_model.to(offload_device)
logging.warning("Had to shift TE back.")
self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
if params['device'] == load_device:
model_management.load_models_gpu([self.patcher], force_full_load=True)
self.layer_idx = None
logging.debug("CLIP model load device: {}, offload device: {}, current: {}".format(load_device, offload_device, params['device']))
logging.debug("CLIP model load device: {}, offload device: {}".format(load_device, offload_device))
def clone(self):
n = CLIP(no_init=True)
@@ -397,17 +385,12 @@ class CLIPType(Enum):
STABLE_CASCADE = 2
SD3 = 3
STABLE_AUDIO = 4
HUNYUAN_DIT = 5
FLUX = 6
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION):
clip_data = []
for p in ckpt_paths:
clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))
return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
clip_data = state_dicts
class EmptyClass:
pass
@@ -429,8 +412,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = sdxl_clip.SDXLRefinerClipModel
clip_target.tokenizer = sdxl_clip.SDXLTokenizer
elif "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data[0]:
clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
clip_target.clip = sd2_clip.SD2ClipModel
clip_target.tokenizer = sd2_clip.SD2Tokenizer
elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in clip_data[0]:
weight = clip_data[0]["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
dtype_t5 = weight.dtype
@@ -444,29 +427,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
else:
w = clip_data[0].get("text_model.embeddings.position_embedding.weight", None)
if w is not None and w.shape[0] == 248:
clip_target.clip = comfy.text_encoders.long_clipl.LongClipModel
clip_target.tokenizer = comfy.text_encoders.long_clipl.LongClipTokenizer
else:
clip_target.clip = sd1_clip.SD1ClipModel
clip_target.tokenizer = sd1_clip.SD1Tokenizer
clip_target.clip = sd1_clip.SD1ClipModel
clip_target.tokenizer = sd1_clip.SD1Tokenizer
elif len(clip_data) == 2:
if clip_type == CLIPType.SD3:
clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False)
clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
elif clip_type == CLIPType.HUNYUAN_DIT:
clip_target.clip = comfy.text_encoders.hydit.HyditModel
clip_target.tokenizer = comfy.text_encoders.hydit.HyditTokenizer
elif clip_type == CLIPType.FLUX:
weight_name = "encoder.block.23.layer.1.DenseReluDense.wi_1.weight"
weight = clip_data[0].get(weight_name, clip_data[1].get(weight_name, None))
dtype_t5 = None
if weight is not None:
dtype_t5 = weight.dtype
clip_target.clip = comfy.text_encoders.flux.flux_clip(dtype_t5=dtype_t5)
clip_target.tokenizer = comfy.text_encoders.flux.FluxTokenizer
else:
clip_target.clip = sdxl_clip.SDXLClipModel
clip_target.tokenizer = sdxl_clip.SDXLTokenizer
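The single-checkpoint branch above picks a text-encoder family purely from state dict keys (a ViT-H layer key for SD2, a T5 block key, a 248-row position embedding for LongCLIP, otherwise CLIP-L). A toy sniffer using only the keys visible in this hunk:

import torch

def sniff_single_te(sd):
    if "text_model.encoder.layers.22.mlp.fc1.weight" in sd:
        return "sd2_clip (CLIP ViT-H)"
    if "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
        return "t5-based encoder"
    w = sd.get("text_model.embeddings.position_embedding.weight", None)
    if w is not None and w.shape[0] == 248:
        return "long_clipl"
    return "sd1_clip (CLIP ViT-L)"

print(sniff_single_te({"text_model.embeddings.position_embedding.weight": torch.zeros(248, 768)}))
# long_clipl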
@@ -474,11 +440,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = comfy.text_encoders.sd3_clip.SD3ClipModel
clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
parameters = 0
for c in clip_data:
parameters += comfy.utils.calculate_parameters(c)
clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, model_options=model_options)
clip = CLIP(clip_target, embedding_directory=embedding_directory)
for c in clip_data:
m, u = clip.load_sd(c)
if len(m) > 0:
@@ -520,39 +482,25 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
return (model, clip, vae)
def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True):
sd = comfy.utils.load_torch_file(ckpt_path)
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
if out is None:
raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
return out
def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
sd_keys = sd.keys()
clip = None
clipvision = None
vae = None
model = None
model_patcher = None
clip_target = None
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
load_device = model_management.get_torch_device()
model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
if model_config is None:
return None
unet_weight_dtype = list(model_config.supported_inference_dtypes)
if weight_dtype is not None:
unet_weight_dtype.append(weight_dtype)
model_config.custom_operations = model_options.get("custom_operations", None)
unet_dtype = model_options.get("weight_dtype", None)
if unet_dtype is None:
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)
raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
@@ -576,8 +524,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
if clip_target is not None:
clip_sd = model_config.process_clip_state_dict(sd)
if len(clip_sd) > 0:
parameters = comfy.utils.calculate_parameters(clip_sd)
clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options)
clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd)
m, u = clip.load_sd(clip_sd, full_model=True)
if len(m) > 0:
m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
@@ -596,16 +543,15 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
logging.debug("left over keys: {}".format(left_over))
if output_model:
model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device(), current_device=inital_load_device)
if inital_load_device != torch.device("cpu"):
logging.info("loaded straight to GPU")
model_management.load_models_gpu([model_patcher], force_full_load=True)
model_management.load_model_gpu(model_patcher)
return (model_patcher, clip, vae, clipvision)
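For orientation, the new signature above returns (ModelPatcher, CLIP, VAE, clipvision) and now threads model_options/te_model_options through to the UNet and text-encoder loaders. A usage sketch (the checkpoint path is a placeholder, not a file shipped with the repo):

import comfy.sd

model, clip, vae, clipvision = comfy.sd.load_checkpoint_guess_config(
    "models/checkpoints/example.safetensors",   # placeholder path
    output_vae=True,
    output_clip=True,
    model_options={},                            # e.g. {"weight_dtype": ...} or {"custom_operations": ...}
    te_model_options={},                         # forwarded to CLIP(..., model_options=te_model_options)
)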
def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffusers or regular format
dtype = model_options.get("dtype", None)
def load_unet_state_dict(sd): #load unet in diffusers or regular format
#Allow loading unets from checkpoint files
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
@@ -614,6 +560,7 @@ def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffuse
sd = temp_sd
parameters = comfy.utils.calculate_parameters(sd)
unet_dtype = model_management.unet_dtype(model_params=parameters)
load_device = model_management.get_torch_device()
model_config = model_detection.model_config_from_unet(sd, "")
@@ -640,14 +587,9 @@ def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffuse
logging.warning("{} {}".format(diffusers_keys[k], k))
offload_device = model_management.unet_offload_device()
if dtype is None:
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
else:
unet_dtype = dtype
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
model_config.custom_operations = model_options.get("custom_operations", None)
model = model_config.get_model(new_sd, "")
model = model.to(offload_device)
model.load_model_weights(new_sd, "")
@@ -656,36 +598,24 @@ def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffuse
logging.info("left over keys in unet: {}".format(left_over))
return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)
def load_diffusion_model(unet_path, model_options={}):
def load_unet(unet_path):
sd = comfy.utils.load_torch_file(unet_path)
model = load_diffusion_model_state_dict(sd, model_options=model_options)
model = load_unet_state_dict(sd)
if model is None:
logging.error("ERROR UNSUPPORTED UNET {}".format(unet_path))
raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
return model
def load_unet(unet_path, dtype=None):
print("WARNING: the load_unet function has been deprecated and will be removed please switch to: load_diffusion_model")
return load_diffusion_model(unet_path, model_options={"dtype": dtype})
def load_unet_state_dict(sd, dtype=None):
print("WARNING: the load_unet_state_dict function has been deprecated and will be removed please switch to: load_diffusion_model_state_dict")
return load_diffusion_model_state_dict(sd, model_options={"dtype": dtype})
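The two wrappers above keep the old entry points alive while routing dtype through model_options. A sketch of the equivalent calls (placeholder path; a real diffusion-model file is needed to actually run this):

import torch
import comfy.sd

path = "models/unet/example_unet.safetensors"   # placeholder

# deprecated spelling, still works but prints the warning above:
# model = comfy.sd.load_unet(path, dtype=torch.float16)

# preferred spelling after this change:
model = comfy.sd.load_diffusion_model(path, model_options={"dtype": torch.float16})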
def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, metadata=None, extra_keys={}):
clip_sd = None
load_models = [model]
if clip is not None:
load_models.append(clip.load_model())
clip_sd = clip.get_sd()
vae_sd = None
if vae is not None:
vae_sd = vae.get_sd()
model_management.load_models_gpu(load_models, force_patch_weights=True)
clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)
sd = model.model.state_dict_for_saving(clip_sd, vae.get_sd(), clip_vision_sd)
for k in extra_keys:
sd[k] = extra_keys[k]

View File

@@ -75,6 +75,7 @@ class ClipTokenWeightEncoder:
return r
class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
"""Uses the CLIP transformer encoder for text (from huggingface)"""
LAYERS = [
"last",
"pooled",
@@ -83,7 +84,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77,
freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False,
return_projected_pooled=True, return_attention_masks=False, model_options={}): # clip-vit-base-patch32
return_projected_pooled=True, return_attention_masks=False): # clip-vit-base-patch32
super().__init__()
assert layer in self.LAYERS
@@ -93,12 +94,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
with open(textmodel_json_config) as f:
config = json.load(f)
operations = model_options.get("custom_operations", None)
if operations is None:
operations = comfy.ops.manual_cast
self.operations = operations
self.transformer = model_class(config, dtype, device, self.operations)
self.transformer = model_class(config, dtype, device, comfy.ops.manual_cast)
self.num_layers = self.transformer.num_layers
self.max_length = max_length
@@ -144,13 +140,15 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
def set_up_textual_embeddings(self, tokens, current_embeds):
out_tokens = []
next_new_token = token_dict_size = current_embeds.weight.shape[0]
next_new_token = token_dict_size = current_embeds.weight.shape[0] - 1
embedding_weights = []
for x in tokens:
tokens_temp = []
for y in x:
if isinstance(y, numbers.Integral):
if y == token_dict_size: #EOS token
y = -1
tokens_temp += [int(y)]
else:
if y.shape[0] == current_embeds.weight.shape[1]:
@@ -165,11 +163,12 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
n = token_dict_size
if len(embedding_weights) > 0:
new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
new_embedding.weight[:token_dict_size] = current_embeds.weight
new_embedding = torch.nn.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
new_embedding.weight[:token_dict_size] = current_embeds.weight[:-1]
for x in embedding_weights:
new_embedding.weight[n] = x
n += 1
new_embedding.weight[n] = current_embeds.weight[-1] #EOS embedding
self.transformer.set_input_embeddings(new_embedding)
processed_tokens = []
@@ -198,7 +197,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
if self.enable_attention_masks:
attention_mask_model = attention_mask
outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state)
self.transformer.set_input_embeddings(backup_embeds)
if self.layer == "last":
@@ -316,17 +315,6 @@ def expand_directory_list(directories):
dirs.add(root)
return list(dirs)
def bundled_embed(embed, prefix, suffix): #bundled embedding in lora format
i = 0
out_list = []
for k in embed:
if k.startswith(prefix) and k.endswith(suffix):
out_list.append(embed[k])
if len(out_list) == 0:
return None
return torch.cat(out_list, dim=0)
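bundled_embed above collects every tensor whose key matches the given prefix/suffix (the lora-format "bundle_emb." layout referenced further down in load_embed) and concatenates them into one embedding. A small runnable demo with a fake state dict:

import torch

def bundled_embed(embed, prefix, suffix):
    # same logic as the hunk above, written with a comprehension
    out_list = [embed[k] for k in embed if k.startswith(prefix) and k.endswith(suffix)]
    if len(out_list) == 0:
        return None
    return torch.cat(out_list, dim=0)

sd = {
    "bundle_emb.style.string_to_param.*": torch.zeros(2, 768),
    "bundle_emb.subject.string_to_param.*": torch.zeros(3, 768),
    "unrelated.key": torch.zeros(1),
}
print(bundled_embed(sd, "bundle_emb.", ".string_to_param.*").shape)   # torch.Size([5, 768])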
def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=None):
if isinstance(embedding_directory, str):
embedding_directory = [embedding_directory]
@@ -393,12 +381,8 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
elif embed_key is not None and embed_key in embed:
embed_out = embed[embed_key]
else:
embed_out = bundled_embed(embed, 'bundle_emb.', '.string_to_param.*')
if embed_out is None:
embed_out = bundled_embed(embed, 'bundle_emb.', '.{}'.format(embed_key))
if embed_out is None:
values = embed.values()
embed_out = next(iter(values))
values = embed.values()
embed_out = next(iter(values))
return embed_out
class SDTokenizer:
@@ -555,12 +539,8 @@ class SD1Tokenizer:
def state_dict(self):
return {}
class SD1CheckpointClipModel(SDClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options)
class SD1ClipModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}, clip_name="l", clip_model=SD1CheckpointClipModel, name=None, **kwargs):
def __init__(self, device="cpu", dtype=None, clip_name="l", clip_model=SDClipModel, name=None, **kwargs):
super().__init__()
if name is not None:
@@ -570,7 +550,7 @@ class SD1ClipModel(torch.nn.Module):
self.clip_name = clip_name
self.clip = "clip_{}".format(self.clip_name)
setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs))
setattr(self, self.clip, clip_model(device=device, dtype=dtype, **kwargs))
self.dtypes = set()
if dtype is not None:

View File

@@ -6,7 +6,7 @@
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"eos_token_id": 2,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"initializer_factor": 1.0,

View File

@@ -2,13 +2,13 @@ from comfy import sd1_clip
import os
class SD2ClipHModel(sd1_clip.SDClipModel):
def __init__(self, arch="ViT-H-14", device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
def __init__(self, arch="ViT-H-14", device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):
if layer == "penultimate":
layer="hidden"
layer_idx=-2
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd2_clip_config.json")
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, return_projected_pooled=True, model_options=model_options)
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0})
class SD2ClipHTokenizer(sd1_clip.SDTokenizer):
def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
@@ -19,5 +19,5 @@ class SD2Tokenizer(sd1_clip.SD1Tokenizer):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="h", tokenizer=SD2ClipHTokenizer)
class SD2ClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="h", clip_model=SD2ClipHModel, **kwargs)
def __init__(self, device="cpu", dtype=None, **kwargs):
super().__init__(device=device, dtype=dtype, clip_name="h", clip_model=SD2ClipHModel, **kwargs)

View File

@@ -5,7 +5,7 @@
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_size": 1024,
"initializer_factor": 1.0,

View File

@@ -3,14 +3,14 @@ import torch
import os
class SDXLClipG(sd1_clip.SDClipModel):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):
if layer == "penultimate":
layer="hidden"
layer_idx=-2
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options)
special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False)
def load_sd(self, sd):
return super().load_sd(sd)
@@ -38,10 +38,10 @@ class SDXLTokenizer:
return {}
class SDXLClipModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}):
def __init__(self, device="cpu", dtype=None):
super().__init__()
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options)
self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False)
self.clip_g = SDXLClipG(device=device, dtype=dtype)
self.dtypes = set([dtype])
def set_clip_options(self, options):
@@ -66,8 +66,8 @@ class SDXLClipModel(torch.nn.Module):
return self.clip_l.load_sd(sd)
class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG, model_options=model_options)
def __init__(self, device="cpu", dtype=None):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG)
class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
@@ -79,14 +79,14 @@ class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="g", tokenizer=StableCascadeClipGTokenizer)
class StableCascadeClipG(sd1_clip.SDClipModel):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, model_options={}):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options)
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True)
def load_sd(self, sd):
return super().load_sd(sd)
class StableCascadeClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG, model_options=model_options)
def __init__(self, device="cpu", dtype=None):
super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG)

View File

@@ -3,13 +3,11 @@ from . import model_base
from . import utils
from . import sd1_clip
from . import sd2_clip
from . import sdxl_clip
import comfy.text_encoders.sd2_clip
import comfy.text_encoders.sd3_clip
import comfy.text_encoders.sa_t5
import comfy.text_encoders.aura_t5
import comfy.text_encoders.hydit
import comfy.text_encoders.flux
from . import supported_models_base
from . import latent_formats
@@ -31,7 +29,6 @@ class SD15(supported_models_base.BASE):
}
latent_format = latent_formats.SD15
memory_usage_factor = 1.0
def process_clip_state_dict(self, state_dict):
k = list(state_dict.keys())
@@ -78,7 +75,6 @@ class SD20(supported_models_base.BASE):
}
latent_format = latent_formats.SD15
memory_usage_factor = 1.0
def model_type(self, state_dict, prefix=""):
if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
@@ -104,7 +100,7 @@ class SD20(supported_models_base.BASE):
return state_dict
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.sd2_clip.SD2Tokenizer, comfy.text_encoders.sd2_clip.SD2ClipModel)
return supported_models_base.ClipTarget(sd2_clip.SD2Tokenizer, sd2_clip.SD2ClipModel)
class SD21UnclipL(SD20):
unet_config = {
@@ -142,7 +138,6 @@ class SDXLRefiner(supported_models_base.BASE):
}
latent_format = latent_formats.SDXL
memory_usage_factor = 1.0
def get_model(self, state_dict, prefix="", device=None):
return model_base.SDXLRefiner(self, device=device)
@@ -181,8 +176,6 @@ class SDXL(supported_models_base.BASE):
latent_format = latent_formats.SDXL
memory_usage_factor = 0.8
def model_type(self, state_dict, prefix=""):
if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5
self.latent_format = latent_formats.SDXL_Playground_2_5()
@@ -510,9 +503,6 @@ class SD3(supported_models_base.BASE):
unet_extra_config = {}
latent_format = latent_formats.SD3
memory_usage_factor = 1.2
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
@@ -590,91 +580,6 @@ class AuraFlow(supported_models_base.BASE):
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.aura_t5.AuraT5Tokenizer, comfy.text_encoders.aura_t5.AuraT5Model)
class HunyuanDiT(supported_models_base.BASE):
unet_config = {
"image_model": "hydit",
}
unet_extra_config = {
"attn_precision": torch.float32,
}
sampling_settings = {
"linear_start": 0.00085,
"linear_end": 0.018,
}
latent_format = latent_formats.SDXL
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanDiT(self, device=device)
return out
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.hydit.HyditTokenizer, comfy.text_encoders.hydit.HyditModel)
class HunyuanDiT1(HunyuanDiT):
unet_config = {
"image_model": "hydit1",
}
unet_extra_config = {}
sampling_settings = {
"linear_start" : 0.00085,
"linear_end" : 0.03,
}
class Flux(supported_models_base.BASE):
unet_config = {
"image_model": "flux",
"guidance_embed": True,
}
sampling_settings = {
}
unet_extra_config = {}
latent_format = latent_formats.Flux
memory_usage_factor = 2.8
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Flux(self, device=device)
return out
def clip_target(self, state_dict={}):
pref = self.text_encoder_key_prefix[0]
t5_key = "{}t5xxl.transformer.encoder.final_layer_norm.weight".format(pref)
dtype_t5 = None
if t5_key in state_dict:
dtype_t5 = state_dict[t5_key].dtype
return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(dtype_t5=dtype_t5))
class FluxSchnell(Flux):
unet_config = {
"image_model": "flux",
"guidance_embed": False,
}
sampling_settings = {
"multiplier": 1.0,
"shift": 1.0,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
return out
models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, HunyuanDiT, HunyuanDiT1, Flux, FluxSchnell]
models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow]
models += [SVD_img2vid]
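The models list above is what model detection walks: the first config class whose unet_config entries all appear in the detected config wins, which is how FluxSchnell (guidance_embed False) is told apart from Flux. A simplified illustration of that subset match; the real logic lives in supported_models_base.BASE.matches and comfy.model_detection, and the toy classes here are not the real ones:

def first_match(models, detected_config):
    # pick the first class whose unet_config is a subset of the detected config
    for cls in models:
        if all(detected_config.get(k) == v for k, v in cls.unet_config.items()):
            return cls
    return None

class FluxDevToy:
    unet_config = {"image_model": "flux", "guidance_embed": True}

class FluxSchnellToy:
    unet_config = {"image_model": "flux", "guidance_embed": False}

match = first_match([FluxDevToy, FluxSchnellToy], {"image_model": "flux", "guidance_embed": False})
print(match.__name__)   # FluxSchnellToy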

View File

@@ -1,21 +1,3 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
from . import model_base
from . import utils
@@ -45,10 +27,7 @@ class BASE:
text_encoder_key_prefix = ["cond_stage_model."]
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
memory_usage_factor = 2.0
manual_cast_dtype = None
custom_operations = None
@classmethod
def matches(s, unet_config, state_dict=None):

View File

@@ -4,9 +4,9 @@ import comfy.text_encoders.t5
import os
class PT5XlModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_config_xl.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True, model_options=model_options)
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)
class PT5XlTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
@@ -18,5 +18,5 @@ class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)
class AuraT5Model(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__(device=device, dtype=dtype, model_options=model_options, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)
def __init__(self, device="cpu", dtype=None, **kwargs):
super().__init__(device=device, dtype=dtype, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)

View File

@@ -1,140 +0,0 @@
import torch
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
class BertAttention(torch.nn.Module):
def __init__(self, embed_dim, heads, dtype, device, operations):
super().__init__()
self.heads = heads
self.query = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
self.key = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
self.value = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
def forward(self, x, mask=None, optimized_attention=None):
q = self.query(x)
k = self.key(x)
v = self.value(x)
out = optimized_attention(q, k, v, self.heads, mask)
return out
class BertOutput(torch.nn.Module):
def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
super().__init__()
self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
self.LayerNorm = operations.LayerNorm(output_dim, eps=layer_norm_eps, dtype=dtype, device=device)
# self.dropout = nn.Dropout(0.0)
def forward(self, x, y):
x = self.dense(x)
# hidden_states = self.dropout(hidden_states)
x = self.LayerNorm(x + y)
return x
class BertAttentionBlock(torch.nn.Module):
def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.self = BertAttention(embed_dim, heads, dtype, device, operations)
self.output = BertOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
def forward(self, x, mask, optimized_attention):
y = self.self(x, mask, optimized_attention)
return self.output(y, x)
class BertIntermediate(torch.nn.Module):
def __init__(self, embed_dim, intermediate_dim, dtype, device, operations):
super().__init__()
self.dense = operations.Linear(embed_dim, intermediate_dim, dtype=dtype, device=device)
def forward(self, x):
x = self.dense(x)
return torch.nn.functional.gelu(x)
class BertBlock(torch.nn.Module):
def __init__(self, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.attention = BertAttentionBlock(embed_dim, heads, layer_norm_eps, dtype, device, operations)
self.intermediate = BertIntermediate(embed_dim, intermediate_dim, dtype, device, operations)
self.output = BertOutput(intermediate_dim, embed_dim, layer_norm_eps, dtype, device, operations)
def forward(self, x, mask, optimized_attention):
x = self.attention(x, mask, optimized_attention)
y = self.intermediate(x)
return self.output(y, x)
class BertEncoder(torch.nn.Module):
def __init__(self, num_layers, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.layer = torch.nn.ModuleList([BertBlock(embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations) for i in range(num_layers)])
def forward(self, x, mask=None, intermediate_output=None):
optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)
if intermediate_output is not None:
if intermediate_output < 0:
intermediate_output = len(self.layer) + intermediate_output
intermediate = None
for i, l in enumerate(self.layer):
x = l(x, mask, optimized_attention)
if i == intermediate_output:
intermediate = x.clone()
return x, intermediate
class BertEmbeddings(torch.nn.Module):
def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, pad_token_id, embed_dim, layer_norm_eps, dtype, device, operations):
super().__init__()
self.word_embeddings = operations.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id, dtype=dtype, device=device)
self.position_embeddings = operations.Embedding(max_position_embeddings, embed_dim, dtype=dtype, device=device)
self.token_type_embeddings = operations.Embedding(type_vocab_size, embed_dim, dtype=dtype, device=device)
self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, input_tokens, token_type_ids=None, dtype=None):
x = self.word_embeddings(input_tokens, out_dtype=dtype)
x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
if token_type_ids is not None:
x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
else:
x += comfy.ops.cast_to_input(self.token_type_embeddings.weight[0], x)
x = self.LayerNorm(x)
return x
class BertModel_(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
embed_dim = config_dict["hidden_size"]
layer_norm_eps = config_dict["layer_norm_eps"]
self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embeddings(input_tokens, dtype=dtype)
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
x, i = self.encoder(x, mask, intermediate_output)
return x, i
class BertModel(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
self.bert = BertModel_(config_dict, dtype, device, operations)
self.num_layers = config_dict["num_hidden_layers"]
def get_input_embeddings(self):
return self.bert.embeddings.word_embeddings
def set_input_embeddings(self, embeddings):
self.bert.embeddings.word_embeddings = embeddings
def forward(self, *args, **kwargs):
return self.bert(*args, **kwargs)

View File

@@ -1,71 +0,0 @@
from comfy import sd1_clip
import comfy.text_encoders.t5
import comfy.model_management
from transformers import T5TokenizerFast
import torch
import os
class T5XXLModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, model_options=model_options)
class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
class FluxTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
return out
def untokenize(self, token_weight_pair):
return self.clip_l.untokenize(token_weight_pair)
def state_dict(self):
return {}
class FluxClipModel(torch.nn.Module):
def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
super().__init__()
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
self.dtypes = set([dtype, dtype_t5])
def set_clip_options(self, options):
self.clip_l.set_clip_options(options)
self.t5xxl.set_clip_options(options)
def reset_clip_options(self):
self.clip_l.reset_clip_options()
self.t5xxl.reset_clip_options()
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_l = token_weight_pairs["l"]
token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
return t5_out, l_pooled
def load_sd(self, sd):
if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
return self.clip_l.load_sd(sd)
else:
return self.t5xxl.load_sd(sd)
def flux_clip(dtype_t5=None):
class FluxClipModel_(FluxClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
return FluxClipModel_
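flux_clip above is a small factory: it bakes dtype_t5 into a subclass so the CLIP loader can keep instantiating text-encoder classes with a uniform (device, dtype, model_options) signature. The same pattern in isolation, with made-up names:

def make_configured(base_cls, **baked):
    # return a subclass with `baked` arguments pre-applied to the constructor
    class Configured(base_cls):
        def __init__(self, **kwargs):
            super().__init__(**baked, **kwargs)
    return Configured

class ToyEncoder:
    def __init__(self, dtype_t5=None, device="cpu"):
        self.dtype_t5 = dtype_t5
        self.device = device

Fp16T5Encoder = make_configured(ToyEncoder, dtype_t5="float16")
print(Fp16T5Encoder(device="cpu").dtype_t5)   # float16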

View File

@@ -1,79 +0,0 @@
from comfy import sd1_clip
from transformers import BertTokenizer
from .spiece_tokenizer import SPieceTokenizer
from .bert import BertModel
import comfy.text_encoders.t5
import os
import torch
class HyditBertModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class HyditBertTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77)
class MT5XLModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_config_xl.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class MT5XLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
#tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_tokenizer"), "spiece.model")
tokenizer = tokenizer_data.get("spiece_model", None)
super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()}
class HyditTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None)
self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)
return out
def untokenize(self, token_weight_pair):
return self.hydit_clip.untokenize(token_weight_pair)
def state_dict(self):
return {"mt5xl.spiece_model": self.mt5xl.state_dict()["spiece_model"]}
class HyditModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__()
self.hydit_clip = HyditBertModel(dtype=dtype, model_options=model_options)
self.mt5xl = MT5XLModel(dtype=dtype, model_options=model_options)
self.dtypes = set()
if dtype is not None:
self.dtypes.add(dtype)
def encode_token_weights(self, token_weight_pairs):
hydit_out = self.hydit_clip.encode_token_weights(token_weight_pairs["hydit_clip"])
mt5_out = self.mt5xl.encode_token_weights(token_weight_pairs["mt5xl"])
return hydit_out[0], hydit_out[1], {"attention_mask": hydit_out[2]["attention_mask"], "conditioning_mt5xl": mt5_out[0], "attention_mask_mt5xl": mt5_out[2]["attention_mask"]}
def load_sd(self, sd):
if "bert.encoder.layer.0.attention.self.query.weight" in sd:
return self.hydit_clip.load_sd(sd)
else:
return self.mt5xl.load_sd(sd)
def set_clip_options(self, options):
self.hydit_clip.set_clip_options(options)
self.mt5xl.set_clip_options(options)
def reset_clip_options(self):
self.hydit_clip.reset_clip_options()
self.mt5xl.reset_clip_options()
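Both FluxClipModel.load_sd above and HyditModel.load_sd here route an incoming state dict to the correct sub-encoder by probing for a key that only one checkpoint contains. A small sketch of that routing idea (the mt5 probe key below is an assumed T5-style name, shown only for illustration):

def route_state_dict(sd, routes):
    # Return the name of the sub-encoder whose signature key appears in the state dict.
    for name, probe_key in routes.items():
        if probe_key in sd:
            return name
    return None

routes = {
    "hydit_clip": "bert.encoder.layer.0.attention.self.query.weight",  # key used in load_sd above
    "mt5xl": "encoder.block.0.layer.0.SelfAttention.q.weight",         # illustrative T5-style key
}
assert route_state_dict({"bert.encoder.layer.0.attention.self.query.weight": None}, routes) == "hydit_clip"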

View File

@@ -1,35 +0,0 @@
{
"_name_or_path": "hfl/chinese-roberta-wwm-ext-large",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.22.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 47020
}

View File

@@ -1,7 +0,0 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}

View File

@@ -1,16 +0,0 @@
{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"name_or_path": "hfl/chinese-roberta-wwm-ext",
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/home/chenweifeng/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext/snapshots/5c58d0b8ec1d9014354d691c538661bf00bfdb44/special_tokens_map.json",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]",
"model_max_length": 77
}

File diff suppressed because it is too large

View File

@@ -1,25 +0,0 @@
{
"_name_or_path": "openai/clip-vit-large-patch14",
"architectures": [
"CLIPTextModel"
],
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 248,
"model_type": "clip_text_model",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"projection_dim": 768,
"torch_dtype": "float32",
"transformers_version": "4.24.0",
"vocab_size": 49408
}

View File

@@ -1,19 +0,0 @@
from comfy import sd1_clip
import os
class LongClipTokenizer_(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
class LongClipModel_(sd1_clip.SDClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "long_clipl.json")
super().__init__(device=device, textmodel_json_config=textmodel_json_config, return_projected_pooled=False, dtype=dtype, model_options=model_options)
class LongClipTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)
class LongClipModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)

View File

@@ -1,22 +0,0 @@
{
"d_ff": 5120,
"d_kv": 64,
"d_model": 2048,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"dense_act_fn": "gelu_pytorch_tanh",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "mt5",
"num_decoder_layers": 24,
"num_heads": 32,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 250112
}

View File

@@ -4,9 +4,9 @@ import comfy.text_encoders.t5
import os
class T5BaseModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_base.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)
class T5BaseTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
@@ -18,5 +18,5 @@ class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5base", tokenizer=T5BaseTokenizer)
class SAT5Model(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__(device=device, dtype=dtype, model_options=model_options, name="t5base", clip_model=T5BaseModel, **kwargs)
def __init__(self, device="cpu", dtype=None, **kwargs):
super().__init__(device=device, dtype=dtype, name="t5base", clip_model=T5BaseModel, **kwargs)

View File

@@ -8,14 +8,14 @@ import comfy.model_management
import logging
class T5XXLModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, model_options=model_options)
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5)
class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
class SD3Tokenizer:
@@ -38,24 +38,31 @@ class SD3Tokenizer:
return {}
class SD3ClipModel(torch.nn.Module):
def __init__(self, clip_l=True, clip_g=True, t5=True, dtype_t5=None, device="cpu", dtype=None, model_options={}):
def __init__(self, clip_l=True, clip_g=True, t5=True, dtype_t5=None, device="cpu", dtype=None):
super().__init__()
self.dtypes = set()
if clip_l:
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False, model_options=model_options)
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False)
self.dtypes.add(dtype)
else:
self.clip_l = None
if clip_g:
self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype)
self.dtypes.add(dtype)
else:
self.clip_g = None
if t5:
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
if dtype_t5 is None:
dtype_t5 = dtype
elif comfy.model_management.dtype_size(dtype_t5) > comfy.model_management.dtype_size(dtype):
dtype_t5 = dtype
if not comfy.model_management.supports_cast(device, dtype_t5):
dtype_t5 = dtype
self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5)
self.dtypes.add(dtype_t5)
else:
self.t5xxl = None
@@ -81,7 +88,7 @@ class SD3ClipModel(torch.nn.Module):
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_l = token_weight_pairs["l"]
token_weight_pairs_g = token_weight_pairs["g"]
token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
token_weight_pars_t5 = token_weight_pairs["t5xxl"]
lg_out = None
pooled = None
out = None
@@ -108,7 +115,7 @@ class SD3ClipModel(torch.nn.Module):
pooled = torch.cat((l_pooled, g_pooled), dim=-1)
if self.t5xxl is not None:
t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pars_t5)
if lg_out is not None:
out = torch.cat([lg_out, t5_out], dim=-2)
else:
@@ -132,6 +139,6 @@ class SD3ClipModel(torch.nn.Module):
def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None):
class SD3ClipModel_(SD3ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
def __init__(self, device="cpu", dtype=None):
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, device=device, dtype=dtype)
return SD3ClipModel_
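The restored branch above picks the T5 weight dtype with an explicit fallback: use the requested dtype_t5 unless it is missing, larger than the main dtype, or not castable on the device (the newer code wraps the same policy in pick_weight_dtype). A standalone sketch of that policy; dtype_size and supports_cast below are simplified stand-ins, not the comfy.model_management API:

import torch

def dtype_size(dtype):
    # Bytes per element, e.g. 4 for float32, 2 for float16.
    return torch.tensor([], dtype=dtype).element_size()

def supports_cast(device, dtype):
    # Simplified assumption: only the common float dtypes are castable everywhere.
    return dtype in (torch.float32, torch.float16, torch.bfloat16)

def pick_t5_dtype(dtype_t5, dtype, device="cpu"):
    if dtype_t5 is None or dtype_size(dtype_t5) > dtype_size(dtype):
        dtype_t5 = dtype
    if not supports_cast(device, dtype_t5):
        dtype_t5 = dtype
    return dtype_t5

assert pick_t5_dtype(None, torch.float16) == torch.float16
assert pick_t5_dtype(torch.float32, torch.float16) == torch.float16  # larger dtype falls back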

View File

@@ -27,6 +27,3 @@ class SPieceTokenizer:
def __call__(self, string):
out = self.tokenizer.encode(string)
return {"input_ids": out}
def serialize_model(self):
return torch.ByteTensor(list(self.tokenizer.serialized_model_proto()))

View File

@@ -1,7 +1,6 @@
import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops
class T5LayerNorm(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
@@ -12,7 +11,7 @@ class T5LayerNorm(torch.nn.Module):
def forward(self, x):
variance = x.pow(2).mean(-1, keepdim=True)
x = x * torch.rsqrt(variance + self.variance_epsilon)
return comfy.ops.cast_to_input(self.weight, x) * x
return self.weight.to(device=x.device, dtype=x.dtype) * x
activations = {
"gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
@@ -83,7 +82,7 @@ class T5Attention(torch.nn.Module):
if relative_attention_bias:
self.relative_attention_num_buckets = 32
self.relative_attention_max_distance = 128
self.relative_attention_bias = operations.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device, dtype=dtype)
self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)
@staticmethod
def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
@@ -133,7 +132,7 @@ class T5Attention(torch.nn.Module):
relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
return relative_buckets
def compute_bias(self, query_length, key_length, device, dtype):
def compute_bias(self, query_length, key_length, device):
"""Compute binned relative position bias"""
context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
@@ -144,7 +143,7 @@ class T5Attention(torch.nn.Module):
num_buckets=self.relative_attention_num_buckets,
max_distance=self.relative_attention_max_distance,
)
values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype) # shape (query_length, key_length, num_heads)
values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
return values
@@ -153,7 +152,7 @@ class T5Attention(torch.nn.Module):
k = self.k(x)
v = self.v(x)
if self.relative_attention_bias is not None:
past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device, x.dtype)
past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)
if past_bias is not None:
if mask is not None:
@@ -200,7 +199,7 @@ class T5Stack(torch.nn.Module):
self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
# self.dropout = nn.Dropout(config.dropout_rate)
def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -226,7 +225,7 @@ class T5(torch.nn.Module):
self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] != "umt5", dtype, device, operations)
self.dtype = dtype
self.shared = operations.Embedding(config_dict["vocab_size"], model_dim, device=device, dtype=dtype)
self.shared = torch.nn.Embedding(config_dict["vocab_size"], model_dim, device=device)
def get_input_embeddings(self):
return self.shared
@@ -235,7 +234,5 @@ class T5(torch.nn.Module):
self.shared = embeddings
def forward(self, input_ids, *args, **kwargs):
x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
x = torch.nan_to_num(x) #Fix for fp8 T5 base
x = self.shared(input_ids)
return self.encoder(x, *args, **kwargs)
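The T5LayerNorm hunk above is an RMS norm: it scales by the root mean square of the features and multiplies by a learned weight, with no mean subtraction and no bias. A minimal standalone version for reference (the repo's class additionally routes the weight through its ops/casting helpers):

import torch

class RMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        variance = x.pow(2).mean(-1, keepdim=True)  # mean of squares, not a true variance
        return self.weight * (x * torch.rsqrt(variance + self.eps))

x = torch.randn(2, 4, 8)
assert RMSNorm(8)(x).shape == x.shape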

View File

@@ -1,22 +1,3 @@
"""
This file is part of ComfyUI.
Copyright (C) 2024 Comfy
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import torch
import math
import struct
@@ -30,7 +11,7 @@ import itertools
def load_torch_file(ckpt, safe_load=False, device=None):
if device is None:
device = torch.device("cpu")
if ckpt.lower().endswith(".safetensors") or ckpt.lower().endswith(".sft"):
if ckpt.lower().endswith(".safetensors"):
sd = safetensors.torch.load_file(ckpt, device=device.type)
else:
if safe_load:
@@ -59,22 +40,9 @@ def calculate_parameters(sd, prefix=""):
params = 0
for k in sd.keys():
if k.startswith(prefix):
w = sd[k]
params += w.nelement()
params += sd[k].nelement()
return params
def weight_dtype(sd, prefix=""):
dtypes = {}
for k in sd.keys():
if k.startswith(prefix):
w = sd[k]
dtypes[w.dtype] = dtypes.get(w.dtype, 0) + 1
if len(dtypes) == 0:
return None
return max(dtypes, key=dtypes.get)
def state_dict_key_replace(state_dict, keys_to_replace):
for x in keys_to_replace:
if x in state_dict:
@@ -434,110 +402,6 @@ def auraflow_to_diffusers(mmdit_config, output_prefix=""):
return key_map
def flux_to_diffusers(mmdit_config, output_prefix=""):
n_double_layers = mmdit_config.get("depth", 0)
n_single_layers = mmdit_config.get("depth_single_blocks", 0)
hidden_size = mmdit_config.get("hidden_size", 0)
key_map = {}
for index in range(n_double_layers):
prefix_from = "transformer_blocks.{}".format(index)
prefix_to = "{}double_blocks.{}".format(output_prefix, index)
for end in ("weight", "bias"):
k = "{}.attn.".format(prefix_from)
qkv = "{}.img_attn.qkv.{}".format(prefix_to, end)
key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size))
key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size))
key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size))
k = "{}.attn.".format(prefix_from)
qkv = "{}.txt_attn.qkv.{}".format(prefix_to, end)
key_map["{}add_q_proj.{}".format(k, end)] = (qkv, (0, 0, hidden_size))
key_map["{}add_k_proj.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size))
key_map["{}add_v_proj.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size))
block_map = {
"attn.to_out.0.weight": "img_attn.proj.weight",
"attn.to_out.0.bias": "img_attn.proj.bias",
"norm1.linear.weight": "img_mod.lin.weight",
"norm1.linear.bias": "img_mod.lin.bias",
"norm1_context.linear.weight": "txt_mod.lin.weight",
"norm1_context.linear.bias": "txt_mod.lin.bias",
"attn.to_add_out.weight": "txt_attn.proj.weight",
"attn.to_add_out.bias": "txt_attn.proj.bias",
"ff.net.0.proj.weight": "img_mlp.0.weight",
"ff.net.0.proj.bias": "img_mlp.0.bias",
"ff.net.2.weight": "img_mlp.2.weight",
"ff.net.2.bias": "img_mlp.2.bias",
"ff_context.net.0.proj.weight": "txt_mlp.0.weight",
"ff_context.net.0.proj.bias": "txt_mlp.0.bias",
"ff_context.net.2.weight": "txt_mlp.2.weight",
"ff_context.net.2.bias": "txt_mlp.2.bias",
"attn.norm_q.weight": "img_attn.norm.query_norm.scale",
"attn.norm_k.weight": "img_attn.norm.key_norm.scale",
"attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
"attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale",
}
for k in block_map:
key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, block_map[k])
for index in range(n_single_layers):
prefix_from = "single_transformer_blocks.{}".format(index)
prefix_to = "{}single_blocks.{}".format(output_prefix, index)
for end in ("weight", "bias"):
k = "{}.attn.".format(prefix_from)
qkv = "{}.linear1.{}".format(prefix_to, end)
key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size))
key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size))
key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size))
key_map["{}.proj_mlp.{}".format(prefix_from, end)] = (qkv, (0, hidden_size * 3, hidden_size * 4))
block_map = {
"norm.linear.weight": "modulation.lin.weight",
"norm.linear.bias": "modulation.lin.bias",
"proj_out.weight": "linear2.weight",
"proj_out.bias": "linear2.bias",
"attn.norm_q.weight": "norm.query_norm.scale",
"attn.norm_k.weight": "norm.key_norm.scale",
}
for k in block_map:
key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, block_map[k])
MAP_BASIC = {
("final_layer.linear.bias", "proj_out.bias"),
("final_layer.linear.weight", "proj_out.weight"),
("img_in.bias", "x_embedder.bias"),
("img_in.weight", "x_embedder.weight"),
("time_in.in_layer.bias", "time_text_embed.timestep_embedder.linear_1.bias"),
("time_in.in_layer.weight", "time_text_embed.timestep_embedder.linear_1.weight"),
("time_in.out_layer.bias", "time_text_embed.timestep_embedder.linear_2.bias"),
("time_in.out_layer.weight", "time_text_embed.timestep_embedder.linear_2.weight"),
("txt_in.bias", "context_embedder.bias"),
("txt_in.weight", "context_embedder.weight"),
("vector_in.in_layer.bias", "time_text_embed.text_embedder.linear_1.bias"),
("vector_in.in_layer.weight", "time_text_embed.text_embedder.linear_1.weight"),
("vector_in.out_layer.bias", "time_text_embed.text_embedder.linear_2.bias"),
("vector_in.out_layer.weight", "time_text_embed.text_embedder.linear_2.weight"),
("guidance_in.in_layer.bias", "time_text_embed.guidance_embedder.linear_1.bias"),
("guidance_in.in_layer.weight", "time_text_embed.guidance_embedder.linear_1.weight"),
("guidance_in.out_layer.bias", "time_text_embed.guidance_embedder.linear_2.bias"),
("guidance_in.out_layer.weight", "time_text_embed.guidance_embedder.linear_2.weight"),
("final_layer.adaLN_modulation.1.bias", "norm_out.linear.bias", swap_scale_shift),
("final_layer.adaLN_modulation.1.weight", "norm_out.linear.weight", swap_scale_shift),
}
for k in MAP_BASIC:
if len(k) > 2:
key_map[k[1]] = ("{}{}".format(output_prefix, k[0]), None, k[2])
else:
key_map[k[1]] = "{}{}".format(output_prefix, k[0])
return key_map
def repeat_to_batch_size(tensor, batch_size, dim=0):
if tensor.shape[dim] > batch_size:
return tensor.narrow(dim, 0, batch_size)
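In flux_to_diffusers above, a value like (qkv, (0, hidden_size, hidden_size)) says that a separate diffusers tensor maps to a slice of a fused qkv tensor, given as (dim, offset, length). A toy sketch of applying such a mapping when repacking weights (sizes and key names are illustrative):

import torch

hidden_size = 8  # toy value; the real model is much larger
key_map = {
    "transformer_blocks.0.attn.to_q.weight": ("double_blocks.0.img_attn.qkv.weight", (0, 0, hidden_size)),
    "transformer_blocks.0.attn.to_k.weight": ("double_blocks.0.img_attn.qkv.weight", (0, hidden_size, hidden_size)),
    "transformer_blocks.0.attn.to_v.weight": ("double_blocks.0.img_attn.qkv.weight", (0, hidden_size * 2, hidden_size)),
}
diffusers_sd = {k: torch.full((hidden_size, hidden_size), float(i)) for i, k in enumerate(key_map)}

fused = {}
for src_key, (dst_key, (dim, offset, length)) in key_map.items():
    dst = fused.setdefault(dst_key, torch.zeros(hidden_size * 3, hidden_size))
    # Copy the separate q/k/v tensor into its slice of the fused qkv weight.
    dst.narrow(dim, offset, length).copy_(diffusers_sd[src_key])

assert torch.equal(fused["double_blocks.0.img_attn.qkv.weight"][hidden_size:hidden_size * 2],
                   torch.ones(hidden_size, hidden_size))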

View File

@@ -1,308 +0,0 @@
import itertools
from typing import Sequence, Mapping
from comfy_execution.graph import DynamicPrompt
import nodes
from comfy_execution.graph_utils import is_link
class CacheKeySet:
def __init__(self, dynprompt, node_ids, is_changed_cache):
self.keys = {}
self.subcache_keys = {}
def add_keys(self, node_ids):
raise NotImplementedError()
def all_node_ids(self):
return set(self.keys.keys())
def get_used_keys(self):
return self.keys.values()
def get_used_subcache_keys(self):
return self.subcache_keys.values()
def get_data_key(self, node_id):
return self.keys.get(node_id, None)
def get_subcache_key(self, node_id):
return self.subcache_keys.get(node_id, None)
class Unhashable:
def __init__(self):
self.value = float("NaN")
def to_hashable(obj):
# So that we don't infinitely recurse since frozenset and tuples
# are Sequences.
if isinstance(obj, (int, float, str, bool, type(None))):
return obj
elif isinstance(obj, Mapping):
return frozenset([(to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items())])
elif isinstance(obj, Sequence):
return frozenset(zip(itertools.count(), [to_hashable(i) for i in obj]))
else:
# TODO - Support other objects like tensors?
return Unhashable()
class CacheKeySetID(CacheKeySet):
def __init__(self, dynprompt, node_ids, is_changed_cache):
super().__init__(dynprompt, node_ids, is_changed_cache)
self.dynprompt = dynprompt
self.add_keys(node_ids)
def add_keys(self, node_ids):
for node_id in node_ids:
if node_id in self.keys:
continue
if not self.dynprompt.has_node(node_id):
continue
node = self.dynprompt.get_node(node_id)
self.keys[node_id] = (node_id, node["class_type"])
self.subcache_keys[node_id] = (node_id, node["class_type"])
class CacheKeySetInputSignature(CacheKeySet):
def __init__(self, dynprompt, node_ids, is_changed_cache):
super().__init__(dynprompt, node_ids, is_changed_cache)
self.dynprompt = dynprompt
self.is_changed_cache = is_changed_cache
self.add_keys(node_ids)
def include_node_id_in_input(self) -> bool:
return False
def add_keys(self, node_ids):
for node_id in node_ids:
if node_id in self.keys:
continue
if not self.dynprompt.has_node(node_id):
continue
node = self.dynprompt.get_node(node_id)
self.keys[node_id] = self.get_node_signature(self.dynprompt, node_id)
self.subcache_keys[node_id] = (node_id, node["class_type"])
def get_node_signature(self, dynprompt, node_id):
signature = []
ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id)
signature.append(self.get_immediate_node_signature(dynprompt, node_id, order_mapping))
for ancestor_id in ancestors:
signature.append(self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping))
return to_hashable(signature)
def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping):
if not dynprompt.has_node(node_id):
# This node doesn't exist -- we can't cache it.
return [float("NaN")]
node = dynprompt.get_node(node_id)
class_type = node["class_type"]
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
signature = [class_type, self.is_changed_cache.get(node_id)]
if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT):
signature.append(node_id)
inputs = node["inputs"]
for key in sorted(inputs.keys()):
if is_link(inputs[key]):
(ancestor_id, ancestor_socket) = inputs[key]
ancestor_index = ancestor_order_mapping[ancestor_id]
signature.append((key,("ANCESTOR", ancestor_index, ancestor_socket)))
else:
signature.append((key, inputs[key]))
return signature
# This function returns a list of all ancestors of the given node. The order of the list is
# deterministic based on which specific inputs the ancestor is connected by.
def get_ordered_ancestry(self, dynprompt, node_id):
ancestors = []
order_mapping = {}
self.get_ordered_ancestry_internal(dynprompt, node_id, ancestors, order_mapping)
return ancestors, order_mapping
def get_ordered_ancestry_internal(self, dynprompt, node_id, ancestors, order_mapping):
if not dynprompt.has_node(node_id):
return
inputs = dynprompt.get_node(node_id)["inputs"]
input_keys = sorted(inputs.keys())
for key in input_keys:
if is_link(inputs[key]):
ancestor_id = inputs[key][0]
if ancestor_id not in order_mapping:
ancestors.append(ancestor_id)
order_mapping[ancestor_id] = len(ancestors) - 1
self.get_ordered_ancestry_internal(dynprompt, ancestor_id, ancestors, order_mapping)
class BasicCache:
def __init__(self, key_class):
self.key_class = key_class
self.initialized = False
self.dynprompt: DynamicPrompt
self.cache_key_set: CacheKeySet
self.cache = {}
self.subcaches = {}
def set_prompt(self, dynprompt, node_ids, is_changed_cache):
self.dynprompt = dynprompt
self.cache_key_set = self.key_class(dynprompt, node_ids, is_changed_cache)
self.is_changed_cache = is_changed_cache
self.initialized = True
def all_node_ids(self):
assert self.initialized
node_ids = self.cache_key_set.all_node_ids()
for subcache in self.subcaches.values():
node_ids = node_ids.union(subcache.all_node_ids())
return node_ids
def _clean_cache(self):
preserve_keys = set(self.cache_key_set.get_used_keys())
to_remove = []
for key in self.cache:
if key not in preserve_keys:
to_remove.append(key)
for key in to_remove:
del self.cache[key]
def _clean_subcaches(self):
preserve_subcaches = set(self.cache_key_set.get_used_subcache_keys())
to_remove = []
for key in self.subcaches:
if key not in preserve_subcaches:
to_remove.append(key)
for key in to_remove:
del self.subcaches[key]
def clean_unused(self):
assert self.initialized
self._clean_cache()
self._clean_subcaches()
def _set_immediate(self, node_id, value):
assert self.initialized
cache_key = self.cache_key_set.get_data_key(node_id)
self.cache[cache_key] = value
def _get_immediate(self, node_id):
if not self.initialized:
return None
cache_key = self.cache_key_set.get_data_key(node_id)
if cache_key in self.cache:
return self.cache[cache_key]
else:
return None
def _ensure_subcache(self, node_id, children_ids):
subcache_key = self.cache_key_set.get_subcache_key(node_id)
subcache = self.subcaches.get(subcache_key, None)
if subcache is None:
subcache = BasicCache(self.key_class)
self.subcaches[subcache_key] = subcache
subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache)
return subcache
def _get_subcache(self, node_id):
assert self.initialized
subcache_key = self.cache_key_set.get_subcache_key(node_id)
if subcache_key in self.subcaches:
return self.subcaches[subcache_key]
else:
return None
def recursive_debug_dump(self):
result = []
for key in self.cache:
result.append({"key": key, "value": self.cache[key]})
for key in self.subcaches:
result.append({"subcache_key": key, "subcache": self.subcaches[key].recursive_debug_dump()})
return result
class HierarchicalCache(BasicCache):
def __init__(self, key_class):
super().__init__(key_class)
def _get_cache_for(self, node_id):
assert self.dynprompt is not None
parent_id = self.dynprompt.get_parent_node_id(node_id)
if parent_id is None:
return self
hierarchy = []
while parent_id is not None:
hierarchy.append(parent_id)
parent_id = self.dynprompt.get_parent_node_id(parent_id)
cache = self
for parent_id in reversed(hierarchy):
cache = cache._get_subcache(parent_id)
if cache is None:
return None
return cache
def get(self, node_id):
cache = self._get_cache_for(node_id)
if cache is None:
return None
return cache._get_immediate(node_id)
def set(self, node_id, value):
cache = self._get_cache_for(node_id)
assert cache is not None
cache._set_immediate(node_id, value)
def ensure_subcache_for(self, node_id, children_ids):
cache = self._get_cache_for(node_id)
assert cache is not None
return cache._ensure_subcache(node_id, children_ids)
class LRUCache(BasicCache):
def __init__(self, key_class, max_size=100):
super().__init__(key_class)
self.max_size = max_size
self.min_generation = 0
self.generation = 0
self.used_generation = {}
self.children = {}
def set_prompt(self, dynprompt, node_ids, is_changed_cache):
super().set_prompt(dynprompt, node_ids, is_changed_cache)
self.generation += 1
for node_id in node_ids:
self._mark_used(node_id)
def clean_unused(self):
while len(self.cache) > self.max_size and self.min_generation < self.generation:
self.min_generation += 1
to_remove = [key for key in self.cache if self.used_generation[key] < self.min_generation]
for key in to_remove:
del self.cache[key]
del self.used_generation[key]
if key in self.children:
del self.children[key]
self._clean_subcaches()
def get(self, node_id):
self._mark_used(node_id)
return self._get_immediate(node_id)
def _mark_used(self, node_id):
cache_key = self.cache_key_set.get_data_key(node_id)
if cache_key is not None:
self.used_generation[cache_key] = self.generation
def set(self, node_id, value):
self._mark_used(node_id)
return self._set_immediate(node_id, value)
def ensure_subcache_for(self, node_id, children_ids):
# Just uses subcaches for tracking 'live' nodes
super()._ensure_subcache(node_id, children_ids)
self.cache_key_set.add_keys(children_ids)
self._mark_used(node_id)
cache_key = self.cache_key_set.get_data_key(node_id)
self.children[cache_key] = []
for child_id in children_ids:
self._mark_used(child_id)
self.children[cache_key].append(self.cache_key_set.get_data_key(child_id))
return self
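The caches above key node outputs on hashable signatures; to_hashable flattens a node's inputs (dicts, lists, scalars) into nested frozensets so they can serve as dictionary keys regardless of ordering. A standalone sketch of the same conversion and how a cache lookup would use it (the logic is copied here for illustration rather than imported):

import itertools
from collections.abc import Mapping, Sequence

def to_hashable(obj):
    if isinstance(obj, (int, float, str, bool, type(None))):
        return obj
    if isinstance(obj, Mapping):
        return frozenset((to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items()))
    if isinstance(obj, Sequence):
        return frozenset(zip(itertools.count(), (to_hashable(i) for i in obj)))
    return object()  # unique object: never equal to anything, so effectively never a cache hit

cache = {}
inputs = {"seed": 5, "steps": 20, "model": ["4", 0]}  # ["4", 0] is a link to node "4", output 0
cache[to_hashable(inputs)] = "cached result"
# Key order does not matter; equivalent inputs hash to the same key.
assert cache[to_hashable({"steps": 20, "model": ["4", 0], "seed": 5})] == "cached result"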

View File

@@ -1,259 +0,0 @@
import nodes
from comfy_execution.graph_utils import is_link
class DependencyCycleError(Exception):
pass
class NodeInputError(Exception):
pass
class NodeNotFoundError(Exception):
pass
class DynamicPrompt:
def __init__(self, original_prompt):
# The original prompt provided by the user
self.original_prompt = original_prompt
# Any extra pieces of the graph created during execution
self.ephemeral_prompt = {}
self.ephemeral_parents = {}
self.ephemeral_display = {}
def get_node(self, node_id):
if node_id in self.ephemeral_prompt:
return self.ephemeral_prompt[node_id]
if node_id in self.original_prompt:
return self.original_prompt[node_id]
raise NodeNotFoundError(f"Node {node_id} not found")
def has_node(self, node_id):
return node_id in self.original_prompt or node_id in self.ephemeral_prompt
def add_ephemeral_node(self, node_id, node_info, parent_id, display_id):
self.ephemeral_prompt[node_id] = node_info
self.ephemeral_parents[node_id] = parent_id
self.ephemeral_display[node_id] = display_id
def get_real_node_id(self, node_id):
while node_id in self.ephemeral_parents:
node_id = self.ephemeral_parents[node_id]
return node_id
def get_parent_node_id(self, node_id):
return self.ephemeral_parents.get(node_id, None)
def get_display_node_id(self, node_id):
while node_id in self.ephemeral_display:
node_id = self.ephemeral_display[node_id]
return node_id
def all_node_ids(self):
return set(self.original_prompt.keys()).union(set(self.ephemeral_prompt.keys()))
def get_original_prompt(self):
return self.original_prompt
def get_input_info(class_def, input_name):
valid_inputs = class_def.INPUT_TYPES()
input_info = None
input_category = None
if "required" in valid_inputs and input_name in valid_inputs["required"]:
input_category = "required"
input_info = valid_inputs["required"][input_name]
elif "optional" in valid_inputs and input_name in valid_inputs["optional"]:
input_category = "optional"
input_info = valid_inputs["optional"][input_name]
elif "hidden" in valid_inputs and input_name in valid_inputs["hidden"]:
input_category = "hidden"
input_info = valid_inputs["hidden"][input_name]
if input_info is None:
return None, None, None
input_type = input_info[0]
if len(input_info) > 1:
extra_info = input_info[1]
else:
extra_info = {}
return input_type, input_category, extra_info
class TopologicalSort:
def __init__(self, dynprompt):
self.dynprompt = dynprompt
self.pendingNodes = {}
self.blockCount = {} # Number of nodes this node is directly blocked by
self.blocking = {} # Which nodes are blocked by this node
def get_input_info(self, unique_id, input_name):
class_type = self.dynprompt.get_node(unique_id)["class_type"]
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
return get_input_info(class_def, input_name)
def make_input_strong_link(self, to_node_id, to_input):
inputs = self.dynprompt.get_node(to_node_id)["inputs"]
if to_input not in inputs:
raise NodeInputError(f"Node {to_node_id} says it needs input {to_input}, but there is no input to that node at all")
value = inputs[to_input]
if not is_link(value):
raise NodeInputError(f"Node {to_node_id} says it needs input {to_input}, but that value is a constant")
from_node_id, from_socket = value
self.add_strong_link(from_node_id, from_socket, to_node_id)
def add_strong_link(self, from_node_id, from_socket, to_node_id):
self.add_node(from_node_id)
if to_node_id not in self.blocking[from_node_id]:
self.blocking[from_node_id][to_node_id] = {}
self.blockCount[to_node_id] += 1
self.blocking[from_node_id][to_node_id][from_socket] = True
def add_node(self, unique_id, include_lazy=False, subgraph_nodes=None):
if unique_id in self.pendingNodes:
return
self.pendingNodes[unique_id] = True
self.blockCount[unique_id] = 0
self.blocking[unique_id] = {}
inputs = self.dynprompt.get_node(unique_id)["inputs"]
for input_name in inputs:
value = inputs[input_name]
if is_link(value):
from_node_id, from_socket = value
if subgraph_nodes is not None and from_node_id not in subgraph_nodes:
continue
input_type, input_category, input_info = self.get_input_info(unique_id, input_name)
is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"]
if include_lazy or not is_lazy:
self.add_strong_link(from_node_id, from_socket, unique_id)
def get_ready_nodes(self):
return [node_id for node_id in self.pendingNodes if self.blockCount[node_id] == 0]
def pop_node(self, unique_id):
del self.pendingNodes[unique_id]
for blocked_node_id in self.blocking[unique_id]:
self.blockCount[blocked_node_id] -= 1
del self.blocking[unique_id]
def is_empty(self):
return len(self.pendingNodes) == 0
class ExecutionList(TopologicalSort):
"""
ExecutionList implements a topological dissolve of the graph. After a node is staged for execution,
it can still be returned to the graph after having further dependencies added.
"""
def __init__(self, dynprompt, output_cache):
super().__init__(dynprompt)
self.output_cache = output_cache
self.staged_node_id = None
def add_strong_link(self, from_node_id, from_socket, to_node_id):
if self.output_cache.get(from_node_id) is not None:
# Nothing to do
return
super().add_strong_link(from_node_id, from_socket, to_node_id)
def stage_node_execution(self):
assert self.staged_node_id is None
if self.is_empty():
return None, None, None
available = self.get_ready_nodes()
if len(available) == 0:
cycled_nodes = self.get_nodes_in_cycle()
# Because cycles composed entirely of static nodes are caught during initial validation,
# we will 'blame' the first node in the cycle that is not a static node.
blamed_node = cycled_nodes[0]
for node_id in cycled_nodes:
display_node_id = self.dynprompt.get_display_node_id(node_id)
if display_node_id != node_id:
blamed_node = display_node_id
break
ex = DependencyCycleError("Dependency cycle detected")
error_details = {
"node_id": blamed_node,
"exception_message": str(ex),
"exception_type": "graph.DependencyCycleError",
"traceback": [],
"current_inputs": []
}
return None, error_details, ex
self.staged_node_id = self.ux_friendly_pick_node(available)
return self.staged_node_id, None, None
def ux_friendly_pick_node(self, node_list):
# If an output node is available, do that first.
# Technically this has no effect on the overall length of execution, but it feels better as a user
# for a PreviewImage to display a result as soon as it can
# Some other heuristics could probably be used here to improve the UX further.
def is_output(node_id):
class_type = self.dynprompt.get_node(node_id)["class_type"]
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
if hasattr(class_def, 'OUTPUT_NODE') and class_def.OUTPUT_NODE == True:
return True
return False
for node_id in node_list:
if is_output(node_id):
return node_id
#This should handle the VAEDecode -> preview case
for node_id in node_list:
for blocked_node_id in self.blocking[node_id]:
if is_output(blocked_node_id):
return node_id
#This should handle the VAELoader -> VAEDecode -> preview case
for node_id in node_list:
for blocked_node_id in self.blocking[node_id]:
for blocked_node_id1 in self.blocking[blocked_node_id]:
if is_output(blocked_node_id1):
return node_id
#TODO: this function should be improved
return node_list[0]
def unstage_node_execution(self):
assert self.staged_node_id is not None
self.staged_node_id = None
def complete_node_execution(self):
node_id = self.staged_node_id
self.pop_node(node_id)
self.staged_node_id = None
def get_nodes_in_cycle(self):
# We'll dissolve the graph in reverse topological order to leave only the nodes in the cycle.
# We're skipping some of the performance optimizations from the original TopologicalSort to keep
# the code simple (and because having a cycle in the first place is a catastrophic error)
blocked_by = { node_id: {} for node_id in self.pendingNodes }
for from_node_id in self.blocking:
for to_node_id in self.blocking[from_node_id]:
if True in self.blocking[from_node_id][to_node_id].values():
blocked_by[to_node_id][from_node_id] = True
to_remove = [node_id for node_id in blocked_by if len(blocked_by[node_id]) == 0]
while len(to_remove) > 0:
for node_id in to_remove:
for to_node_id in blocked_by:
if node_id in blocked_by[to_node_id]:
del blocked_by[to_node_id][node_id]
del blocked_by[node_id]
to_remove = [node_id for node_id in blocked_by if len(blocked_by[node_id]) == 0]
return list(blocked_by.keys())
class ExecutionBlocker:
"""
Return this from a node and any users will be blocked with the given error message.
If the message is None, execution will be blocked silently instead.
Generally, you should avoid using this functionality unless absolutely necessary. Whenever it's
possible, a lazy input will be more efficient and have a better user experience.
This functionality is useful in two cases:
1. You want to conditionally prevent an output node from executing. (Particularly a built-in node
like SaveImage. For your own output nodes, I would recommend just adding a BOOL input and using
lazy evaluation to let it conditionally disable itself.)
2. You have a node with multiple possible outputs, some of which are invalid and should not be used.
(I would recommend not making nodes like this in the future -- instead, make multiple nodes with
different outputs. Unfortunately, there are several popular existing nodes using this pattern.)
"""
def __init__(self, message):
self.message = message
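As the docstring above describes, a node blocks its downstream users by returning an ExecutionBlocker in place of a value. A hedged sketch of a custom node that blocks one of two outputs so only the selected branch runs (the node class is hypothetical and assumes the ComfyUI runtime; input type names follow the stock primitives):

from comfy_execution.graph import ExecutionBlocker

class GateOutput:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"value": ("IMAGE",), "use_second": ("BOOLEAN", {"default": False})}}

    RETURN_TYPES = ("IMAGE", "IMAGE")
    FUNCTION = "gate"
    CATEGORY = "utils"

    def gate(self, value, use_second):
        blocked = ExecutionBlocker(None)  # None blocks silently; a string would surface an error message
        return (blocked, value) if use_second else (value, blocked)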

View File

@@ -1,139 +0,0 @@
def is_link(obj):
if not isinstance(obj, list):
return False
if len(obj) != 2:
return False
if not isinstance(obj[0], str):
return False
if not isinstance(obj[1], int) and not isinstance(obj[1], float):
return False
return True
# The GraphBuilder is just a utility class that outputs graphs in the form expected by the ComfyUI back-end
class GraphBuilder:
_default_prefix_root = ""
_default_prefix_call_index = 0
_default_prefix_graph_index = 0
def __init__(self, prefix = None):
if prefix is None:
self.prefix = GraphBuilder.alloc_prefix()
else:
self.prefix = prefix
self.nodes = {}
self.id_gen = 1
@classmethod
def set_default_prefix(cls, prefix_root, call_index, graph_index = 0):
cls._default_prefix_root = prefix_root
cls._default_prefix_call_index = call_index
cls._default_prefix_graph_index = graph_index
@classmethod
def alloc_prefix(cls, root=None, call_index=None, graph_index=None):
if root is None:
root = GraphBuilder._default_prefix_root
if call_index is None:
call_index = GraphBuilder._default_prefix_call_index
if graph_index is None:
graph_index = GraphBuilder._default_prefix_graph_index
result = f"{root}.{call_index}.{graph_index}."
GraphBuilder._default_prefix_graph_index += 1
return result
def node(self, class_type, id=None, **kwargs):
if id is None:
id = str(self.id_gen)
self.id_gen += 1
id = self.prefix + id
if id in self.nodes:
return self.nodes[id]
node = Node(id, class_type, kwargs)
self.nodes[id] = node
return node
def lookup_node(self, id):
id = self.prefix + id
return self.nodes.get(id)
def finalize(self):
output = {}
for node_id, node in self.nodes.items():
output[node_id] = node.serialize()
return output
def replace_node_output(self, node_id, index, new_value):
node_id = self.prefix + node_id
to_remove = []
for node in self.nodes.values():
for key, value in node.inputs.items():
if is_link(value) and value[0] == node_id and value[1] == index:
if new_value is None:
to_remove.append((node, key))
else:
node.inputs[key] = new_value
for node, key in to_remove:
del node.inputs[key]
def remove_node(self, id):
id = self.prefix + id
del self.nodes[id]
class Node:
def __init__(self, id, class_type, inputs):
self.id = id
self.class_type = class_type
self.inputs = inputs
self.override_display_id = None
def out(self, index):
return [self.id, index]
def set_input(self, key, value):
if value is None:
if key in self.inputs:
del self.inputs[key]
else:
self.inputs[key] = value
def get_input(self, key):
return self.inputs.get(key)
def set_override_display_id(self, override_display_id):
self.override_display_id = override_display_id
def serialize(self):
serialized = {
"class_type": self.class_type,
"inputs": self.inputs
}
if self.override_display_id is not None:
serialized["override_display_id"] = self.override_display_id
return serialized
def add_graph_prefix(graph, outputs, prefix):
# Change the node IDs and any internal links
new_graph = {}
for node_id, node_info in graph.items():
# Make sure the added nodes have unique IDs
new_node_id = prefix + node_id
new_node = { "class_type": node_info["class_type"], "inputs": {} }
for input_name, input_value in node_info.get("inputs", {}).items():
if is_link(input_value):
new_node["inputs"][input_name] = [prefix + input_value[0], input_value[1]]
else:
new_node["inputs"][input_name] = input_value
new_graph[new_node_id] = new_node
# Change the node IDs in the outputs
new_outputs = []
for n in range(len(outputs)):
output = outputs[n]
if is_link(output):
new_outputs.append([prefix + output[0], output[1]])
else:
new_outputs.append(output)
return new_graph, tuple(new_outputs)
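GraphBuilder above emits graphs in the back-end's prompt format: a dict of node id -> {"class_type", "inputs"}, with links encoded as [node_id, output_index]. A short usage sketch, assuming a ComfyUI checkout on the path (the node class names are stock nodes; the checkpoint filename is illustrative):

from comfy_execution.graph_utils import GraphBuilder

g = GraphBuilder(prefix="demo.")
ckpt = g.node("CheckpointLoaderSimple", ckpt_name="model.safetensors")
pos = g.node("CLIPTextEncode", clip=ckpt.out(1), text="a photo of a cat")  # out(1) is the CLIP output
prompt_fragment = g.finalize()
# Links are serialized as [node_id, output_index] pairs; ids get the "demo." prefix.
assert prompt_fragment["demo.2"]["inputs"]["clip"] == ["demo.1", 1]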

View File

@@ -7,7 +7,6 @@ import io
import json
import struct
import random
import hashlib
from comfy.cli_args import args
class EmptyLatentAudio:

View File

@@ -295,23 +295,6 @@ class SamplerDPMPP_SDE:
sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise, "r": r})
return (sampler, )
class SamplerDPMPP_2S_Ancestral:
@classmethod
def INPUT_TYPES(s):
return {"required":
{"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
"s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
}
}
RETURN_TYPES = ("SAMPLER",)
CATEGORY = "sampling/custom_sampling/samplers"
FUNCTION = "get_sampler"
def get_sampler(self, eta, s_noise):
sampler = comfy.samplers.ksampler("dpmpp_2s_ancestral", {"eta": eta, "s_noise": s_noise})
return (sampler, )
class SamplerEulerAncestral:
@classmethod
def INPUT_TYPES(s):
@@ -683,7 +666,6 @@ NODE_CLASS_MAPPINGS = {
"SamplerDPMPP_3M_SDE": SamplerDPMPP_3M_SDE,
"SamplerDPMPP_2M_SDE": SamplerDPMPP_2M_SDE,
"SamplerDPMPP_SDE": SamplerDPMPP_SDE,
"SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral,
"SamplerDPMAdaptative": SamplerDPMAdaptative,
"SplitSigmas": SplitSigmas,
"SplitSigmasDenoise": SplitSigmasDenoise,
@@ -700,4 +682,4 @@ NODE_CLASS_MAPPINGS = {
NODE_DISPLAY_NAME_MAPPINGS = {
"SamplerEulerAncestralCFGPP": "SamplerEulerAncestralCFG++",
}
}

View File

@@ -1,47 +0,0 @@
import node_helpers
class CLIPTextEncodeFlux:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"clip": ("CLIP", ),
"clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}),
"t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
"guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}),
}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "encode"
CATEGORY = "advanced/conditioning/flux"
def encode(self, clip, clip_l, t5xxl, guidance):
tokens = clip.tokenize(clip_l)
tokens["t5xxl"] = clip.tokenize(t5xxl)["t5xxl"]
output = clip.encode_from_tokens(tokens, return_pooled=True, return_dict=True)
cond = output.pop("cond")
output["guidance"] = guidance
return ([[cond, output]], )
class FluxGuidance:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"conditioning": ("CONDITIONING", ),
"guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}),
}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
CATEGORY = "advanced/conditioning/flux"
def append(self, conditioning, guidance):
c = node_helpers.conditioning_set_values(conditioning, {"guidance": guidance})
return (c, )
NODE_CLASS_MAPPINGS = {
"CLIPTextEncodeFlux": CLIPTextEncodeFlux,
"FluxGuidance": FluxGuidance,
}

View File

@@ -1,25 +0,0 @@
class CLIPTextEncodeHunyuanDiT:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"clip": ("CLIP", ),
"bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
"mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "encode"
CATEGORY = "advanced/conditioning"
def encode(self, clip, bert, mt5xl):
tokens = clip.tokenize(bert)
tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
output = clip.encode_from_tokens(tokens, return_pooled=True, return_dict=True)
cond = output.pop("cond")
return ([[cond, output]], )
NODE_CLASS_MAPPINGS = {
"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
}

View File

@@ -2,7 +2,6 @@ import folder_paths
import comfy.sd
import comfy.model_sampling
import comfy.latent_formats
import nodes
import torch
class LCM(comfy.model_sampling.EPS):
@@ -171,42 +170,6 @@ class ModelSamplingAuraFlow(ModelSamplingSD3):
def patch_aura(self, model, shift):
return self.patch(model, shift, multiplier=1.0)
class ModelSamplingFlux:
@classmethod
def INPUT_TYPES(s):
return {"required": { "model": ("MODEL",),
"max_shift": ("FLOAT", {"default": 1.15, "min": 0.0, "max": 100.0, "step":0.01}),
"base_shift": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 100.0, "step":0.01}),
"width": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}),
"height": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}),
}}
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
def patch(self, model, max_shift, base_shift, width, height):
m = model.clone()
x1 = 256
x2 = 4096
mm = (max_shift - base_shift) / (x2 - x1)
b = base_shift - mm * x1
shift = (width * height / (8 * 8 * 2 * 2)) * mm + b
sampling_base = comfy.model_sampling.ModelSamplingFlux
sampling_type = comfy.model_sampling.CONST
class ModelSamplingAdvanced(sampling_base, sampling_type):
pass
model_sampling = ModelSamplingAdvanced(model.model.model_config)
model_sampling.set_parameters(shift=shift)
m.add_object_patch("model_sampling", model_sampling)
return (m, )
class ModelSamplingContinuousEDM:
@classmethod
def INPUT_TYPES(s):
@@ -321,6 +284,5 @@ NODE_CLASS_MAPPINGS = {
"ModelSamplingStableCascade": ModelSamplingStableCascade,
"ModelSamplingSD3": ModelSamplingSD3,
"ModelSamplingAuraFlow": ModelSamplingAuraFlow,
"ModelSamplingFlux": ModelSamplingFlux,
"RescaleCFG": RescaleCFG,
}
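The removed ModelSamplingFlux.patch above interpolates the sampling shift linearly between base_shift at 256 image tokens and max_shift at 4096 tokens, where one token covers a 16x16 pixel area (8x VAE downscale, then 2x2 patches). A worked check of that arithmetic with the node's default values:

def flux_shift(width, height, base_shift=0.5, max_shift=1.15):
    x1, x2 = 256, 4096  # token counts the two endpoints are anchored to
    mm = (max_shift - base_shift) / (x2 - x1)
    b = base_shift - mm * x1
    tokens = width * height / (8 * 8 * 2 * 2)
    return tokens * mm + b

assert abs(flux_shift(1024, 1024) - 1.15) < 1e-9  # 4096 tokens -> max_shift
assert abs(flux_shift(512, 512) - 0.63) < 1e-9    # 1024 tokens -> partway between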

View File

@@ -264,7 +264,6 @@ class CLIPSave:
metadata = {}
if not args.disable_metadata:
metadata["format"] = "pt"
metadata["prompt"] = prompt_info
if extra_pnginfo is not None:
for x in extra_pnginfo:
@@ -333,25 +332,6 @@ class VAESave:
comfy.utils.save_torch_file(vae.get_sd(), output_checkpoint, metadata=metadata)
return {}
class ModelSave:
def __init__(self):
self.output_dir = folder_paths.get_output_directory()
@classmethod
def INPUT_TYPES(s):
return {"required": { "model": ("MODEL",),
"filename_prefix": ("STRING", {"default": "diffusion_models/ComfyUI"}),},
"hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},}
RETURN_TYPES = ()
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "advanced/model_merging"
def save(self, model, filename_prefix, prompt=None, extra_pnginfo=None):
save_checkpoint(model, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
return {}
NODE_CLASS_MAPPINGS = {
"ModelMergeSimple": ModelMergeSimple,
"ModelMergeBlocks": ModelMergeBlocks,
@@ -363,9 +343,4 @@ NODE_CLASS_MAPPINGS = {
"CLIPMergeAdd": CLIPAdd,
"CLIPSave": CLIPSave,
"VAESave": VAESave,
"ModelSave": ModelSave,
}
NODE_DISPLAY_NAME_MAPPINGS = {
"CheckpointSave": "Save Checkpoint",
}

View File

@@ -75,36 +75,9 @@ class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
@classmethod
def INPUT_TYPES(s):
arg_dict = { "model1": ("MODEL",),
"model2": ("MODEL",)}
argument = ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01})
arg_dict["img_in."] = argument
arg_dict["time_in."] = argument
arg_dict["guidance_in"] = argument
arg_dict["vector_in."] = argument
arg_dict["txt_in."] = argument
for i in range(19):
arg_dict["double_blocks.{}.".format(i)] = argument
for i in range(38):
arg_dict["single_blocks.{}.".format(i)] = argument
arg_dict["final_layer."] = argument
return {"required": arg_dict}
NODE_CLASS_MAPPINGS = {
"ModelMergeSD1": ModelMergeSD1,
"ModelMergeSD2": ModelMergeSD1, #SD1 and SD2 have the same blocks
"ModelMergeSDXL": ModelMergeSDXL,
"ModelMergeSD3_2B": ModelMergeSD3_2B,
"ModelMergeFlux1": ModelMergeFlux1,
}

View File

@@ -12,7 +12,7 @@ class PerturbedAttentionGuidance:
return {
"required": {
"model": ("MODEL",),
"scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": 0.01}),
"scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01}),
}
}

View File

@@ -96,7 +96,7 @@ class SelfAttentionGuidance:
@classmethod
def INPUT_TYPES(s):
return {"required": { "model": ("MODEL",),
"scale": ("FLOAT", {"default": 0.5, "min": -2.0, "max": 5.0, "step": 0.01}),
"scale": ("FLOAT", {"default": 0.5, "min": -2.0, "max": 5.0, "step": 0.1}),
"blur_sigma": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 10.0, "step": 0.1}),
}}
RETURN_TYPES = ("MODEL",)

View File

@@ -27,8 +27,8 @@ class EmptySD3LatentImage:
@classmethod
def INPUT_TYPES(s):
return {"required": { "width": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"height": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
return {"required": { "width": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}),
"height": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
RETURN_TYPES = ("LATENT",)
FUNCTION = "generate"
@@ -100,8 +100,3 @@ NODE_CLASS_MAPPINGS = {
"CLIPTextEncodeSD3": CLIPTextEncodeSD3,
"ControlNetApplySD3": ControlNetApplySD3,
}
NODE_DISPLAY_NAME_MAPPINGS = {
# Sampling
"ControlNetApplySD3": "ControlNetApply SD3 and HunyuanDiT",
}

View File

@@ -4,14 +4,14 @@ class Example:
Class methods
-------------
INPUT_TYPES (dict):
INPUT_TYPES (dict):
Tell the main program input parameters of nodes.
IS_CHANGED:
optional method to control when the node is re-executed.
Attributes
----------
RETURN_TYPES (`tuple`):
RETURN_TYPES (`tuple`):
The type of each element in the output tuple.
RETURN_NAMES (`tuple`):
Optional: The name of each output in the output tuple.
@@ -23,19 +23,13 @@ class Example:
Assumed to be False if not present.
CATEGORY (`str`):
The category the node should appear in the UI.
DEPRECATED (`bool`):
Indicates whether the node is deprecated. Deprecated nodes are hidden by default in the UI, but remain
functional in existing workflows that use them.
EXPERIMENTAL (`bool`):
Indicates whether the node is experimental. Experimental nodes are marked as such in the UI and may be subject to
significant changes or removal in future versions. Use with caution in production workflows.
execute(s) -> tuple || None:
The entry point method. The name of this method must be the same as the value of property `FUNCTION`.
For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`.
"""
def __init__(self):
pass
@classmethod
def INPUT_TYPES(s):
"""
@@ -60,8 +54,7 @@ class Example:
"min": 0, #Minimum value
"max": 4096, #Maximum value
"step": 64, #Slider's step
"display": "number", # Cosmetic only: display as "number" or "slider"
"lazy": True # Will only be evaluated if check_lazy_status requires it
"display": "number" # Cosmetic only: display as "number" or "slider"
}),
"float_field": ("FLOAT", {
"default": 1.0,
@@ -69,14 +62,11 @@ class Example:
"max": 10.0,
"step": 0.01,
"round": 0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding.
"display": "number",
"lazy": True
}),
"display": "number"}),
"print_to_screen": (["enable", "disable"],),
"string_field": ("STRING", {
"multiline": False, #True if you want the field to look like the one on the ClipTextEncode node
"default": "Hello World!",
"lazy": True
"default": "Hello World!"
}),
},
}
@@ -90,23 +80,6 @@ class Example:
CATEGORY = "Example"
def check_lazy_status(self, image, string_field, int_field, float_field, print_to_screen):
"""
Return a list of input names that need to be evaluated.
This function will be called if there are any lazy inputs which have not yet been
evaluated. As long as you return at least one field which has not yet been evaluated
(and more exist), this function will be called again once the value of the requested
field is available.
Any evaluated inputs will be passed as arguments to this function. Any unevaluated
inputs will have the value None.
"""
if print_to_screen == "enable":
return ["int_field", "float_field", "string_field"]
else:
return []
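The docstrings above spell out the custom-node contract: INPUT_TYPES declares the widgets, RETURN_TYPES/FUNCTION/CATEGORY describe the outputs and entry point, and check_lazy_status names which lazy inputs still need evaluating. A minimal sketch of a node that follows that contract (class, field, and category names here are illustrative, not taken from the repository):
class MinimalExample:
    """Adds an optional integer offset to a float; the offset input is lazy."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "value": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
                "use_offset": (["enable", "disable"],),
                "offset": ("INT", {"default": 0, "min": 0, "max": 4096, "lazy": True}),
            }
        }

    RETURN_TYPES = ("FLOAT",)
    FUNCTION = "run"
    CATEGORY = "Example"

    def check_lazy_status(self, value, use_offset, offset):
        # Ask for the lazy input only when it will actually be used;
        # unevaluated inputs arrive here as None.
        return ["offset"] if use_offset == "enable" else []

    def run(self, value, use_offset, offset):
        return (value + (offset or 0),)

# Registering the class is what makes the node visible to the loader.
NODE_CLASS_MAPPINGS = {"MinimalExample": MinimalExample}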
def test(self, image, string_field, int_field, float_field, print_to_screen):
if print_to_screen == "enable":
print(f"""Your input contains:

View File

@@ -5,7 +5,6 @@ import threading
import heapq
import time
import traceback
from enum import Enum
import inspect
from typing import List, Literal, NamedTuple, Optional
@@ -13,219 +12,102 @@ import torch
import nodes
import comfy.model_management
from comfy_execution.graph import get_input_info, ExecutionList, DynamicPrompt, ExecutionBlocker
from comfy_execution.graph_utils import is_link, GraphBuilder
from comfy_execution.caching import HierarchicalCache, LRUCache, CacheKeySetInputSignature, CacheKeySetID
from comfy.cli_args import args
class ExecutionResult(Enum):
SUCCESS = 0
FAILURE = 1
PENDING = 2
class DuplicateNodeError(Exception):
pass
class IsChangedCache:
def __init__(self, dynprompt, outputs_cache):
self.dynprompt = dynprompt
self.outputs_cache = outputs_cache
self.is_changed = {}
def get(self, node_id):
if node_id in self.is_changed:
return self.is_changed[node_id]
node = self.dynprompt.get_node(node_id)
class_type = node["class_type"]
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
if not hasattr(class_def, "IS_CHANGED"):
self.is_changed[node_id] = False
return self.is_changed[node_id]
if "is_changed" in node:
self.is_changed[node_id] = node["is_changed"]
return self.is_changed[node_id]
# Intentionally do not use cached outputs here. We only want constants in IS_CHANGED
input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None)
try:
is_changed = _map_node_over_list(class_def, input_data_all, "IS_CHANGED")
node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed]
except Exception as e:
logging.warning("WARNING: {}".format(e))
node["is_changed"] = float("NaN")
finally:
self.is_changed[node_id] = node["is_changed"]
return self.is_changed[node_id]
class CacheSet:
def __init__(self, lru_size=None):
if lru_size is None or lru_size == 0:
self.init_classic_cache()
else:
self.init_lru_cache(lru_size)
self.all = [self.outputs, self.ui, self.objects]
# Useful for those with ample RAM/VRAM -- allows experimenting without
# blowing away the cache every time
def init_lru_cache(self, cache_size):
self.outputs = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
self.objects = HierarchicalCache(CacheKeySetID)
# Performs like the old cache -- dump data ASAP
def init_classic_cache(self):
self.outputs = HierarchicalCache(CacheKeySetInputSignature)
self.ui = HierarchicalCache(CacheKeySetInputSignature)
self.objects = HierarchicalCache(CacheKeySetID)
def recursive_debug_dump(self):
result = {
"outputs": self.outputs.recursive_debug_dump(),
"ui": self.ui.recursive_debug_dump(),
}
return result
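The comments above capture the trade-off between the two cache modes: the classic cache drops data as soon as the current prompt no longer needs it, while the LRU variant keeps up to cache_size node outputs alive across prompts. For readers unfamiliar with the eviction behaviour being referred to, a tiny self-contained illustration (plain Python, not the project's LRUCache class):
from collections import OrderedDict

class TinyLRU:
    def __init__(self, max_size):
        self.max_size = max_size
        self.entries = OrderedDict()

    def set(self, key, value):
        self.entries[key] = value
        self.entries.move_to_end(key)
        while len(self.entries) > self.max_size:
            self.entries.popitem(last=False)   # evict the least recently used entry

    def get(self, key):
        if key in self.entries:
            self.entries.move_to_end(key)      # a hit counts as "recently used"
            return self.entries[key]
        return None

cache = TinyLRU(max_size=2)
cache.set("node_1", "output_1")
cache.set("node_2", "output_2")
cache.get("node_1")                # touch node_1 so it survives the next insert
cache.set("node_3", "output_3")    # evicts node_2, the least recently used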
def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}):
def get_input_data(inputs, class_def, unique_id, outputs={}, prompt={}, extra_data={}):
valid_inputs = class_def.INPUT_TYPES()
input_data_all = {}
missing_keys = {}
for x in inputs:
input_data = inputs[x]
input_type, input_category, input_info = get_input_info(class_def, x)
def mark_missing():
missing_keys[x] = True
input_data_all[x] = (None,)
if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)):
if isinstance(input_data, list):
input_unique_id = input_data[0]
output_index = input_data[1]
if outputs is None:
mark_missing()
continue # This might be a lazily-evaluated input
cached_output = outputs.get(input_unique_id)
if cached_output is None:
mark_missing()
if input_unique_id not in outputs:
input_data_all[x] = (None,)
continue
if output_index >= len(cached_output):
mark_missing()
continue
obj = cached_output[output_index]
obj = outputs[input_unique_id][output_index]
input_data_all[x] = obj
elif input_category is not None:
input_data_all[x] = [input_data]
else:
if ("required" in valid_inputs and x in valid_inputs["required"]) or ("optional" in valid_inputs and x in valid_inputs["optional"]):
input_data_all[x] = [input_data]
if "hidden" in valid_inputs:
h = valid_inputs["hidden"]
for x in h:
if h[x] == "PROMPT":
input_data_all[x] = [dynprompt.get_original_prompt() if dynprompt is not None else {}]
if h[x] == "DYNPROMPT":
input_data_all[x] = [dynprompt]
input_data_all[x] = [prompt]
if h[x] == "EXTRA_PNGINFO":
input_data_all[x] = [extra_data.get('extra_pnginfo', None)]
if h[x] == "UNIQUE_ID":
input_data_all[x] = [unique_id]
return input_data_all, missing_keys
return input_data_all
map_node_over_list = None #Don't hook this please
def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None):
def map_node_over_list(obj, input_data_all, func, allow_interrupt=False):
# check if node wants the lists
input_is_list = getattr(obj, "INPUT_IS_LIST", False)
input_is_list = False
if hasattr(obj, "INPUT_IS_LIST"):
input_is_list = obj.INPUT_IS_LIST
if len(input_data_all) == 0:
max_len_input = 0
else:
max_len_input = max(len(x) for x in input_data_all.values())
max_len_input = max([len(x) for x in input_data_all.values()])
# get a slice of inputs, repeat last input when list isn't long enough
def slice_dict(d, i):
return {k: v[i if len(v) > i else -1] for k, v in d.items()}
d_new = dict()
for k,v in d.items():
d_new[k] = v[i if len(v) > i else -1]
return d_new
results = []
def process_inputs(inputs, index=None):
if input_is_list:
if allow_interrupt:
nodes.before_node_execution()
execution_block = None
for k, v in inputs.items():
if isinstance(v, ExecutionBlocker):
execution_block = execution_block_cb(v) if execution_block_cb else v
break
if execution_block is None:
if pre_execute_cb is not None and index is not None:
pre_execute_cb(index)
results.append(getattr(obj, func)(**inputs))
else:
results.append(execution_block)
if input_is_list:
process_inputs(input_data_all, 0)
results.append(getattr(obj, func)(**input_data_all))
elif max_len_input == 0:
process_inputs({})
else:
if allow_interrupt:
nodes.before_node_execution()
results.append(getattr(obj, func)())
else:
for i in range(max_len_input):
input_dict = slice_dict(input_data_all, i)
process_inputs(input_dict, i)
if allow_interrupt:
nodes.before_node_execution()
results.append(getattr(obj, func)(**slice_dict(input_data_all, i)))
return results
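The max_len_input/slice_dict pair above is what gives list inputs their broadcast semantics: shorter lists repeat their last element so every call sees a full set of inputs. A standalone illustration (the input names are made up):
def slice_dict(d, i):
    return {k: v[i if len(v) > i else -1] for k, v in d.items()}

input_data_all = {
    "image": ["img_a", "img_b", "img_c"],   # three images
    "scale": [1.0, 2.0],                    # only two scales
}
max_len_input = max(len(v) for v in input_data_all.values())   # 3
calls = [slice_dict(input_data_all, i) for i in range(max_len_input)]
# calls[0] == {"image": "img_a", "scale": 1.0}
# calls[1] == {"image": "img_b", "scale": 2.0}
# calls[2] == {"image": "img_c", "scale": 2.0}   <- last scale repeated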
def merge_result_data(results, obj):
# check which outputs need concatenating
output = []
output_is_list = [False] * len(results[0])
if hasattr(obj, "OUTPUT_IS_LIST"):
output_is_list = obj.OUTPUT_IS_LIST
# merge node execution results
for i, is_list in zip(range(len(results[0])), output_is_list):
if is_list:
output.append([x for o in results for x in o[i]])
else:
output.append([o[i] for o in results])
return output
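merge_result_data then decides, per output slot, whether the per-call results are collected (one entry per call) or concatenated (when OUTPUT_IS_LIST marks that slot as already being a list). For example, using merge_result_data as defined above:
class Obj:
    OUTPUT_IS_LIST = (False, True)

results = [
    ("lat_a", ["x1", "x2"]),   # return value of the first call
    ("lat_b", ["x3"]),         # return value of the second call
]
merged = merge_result_data(results, Obj)
# merged[0] == ["lat_a", "lat_b"]     one entry per call
# merged[1] == ["x1", "x2", "x3"]     concatenated because the slot is marked as a list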
def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb=None):
def get_output_data(obj, input_data_all):
results = []
uis = []
subgraph_results = []
return_values = _map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb)
has_subgraph = False
for i in range(len(return_values)):
r = return_values[i]
return_values = map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True)
for r in return_values:
if isinstance(r, dict):
if 'ui' in r:
uis.append(r['ui'])
if 'expand' in r:
# Perform an expansion, but do not append results
has_subgraph = True
new_graph = r['expand']
result = r.get("result", None)
if isinstance(result, ExecutionBlocker):
result = tuple([result] * len(obj.RETURN_TYPES))
subgraph_results.append((new_graph, result))
elif 'result' in r:
result = r.get("result", None)
if isinstance(result, ExecutionBlocker):
result = tuple([result] * len(obj.RETURN_TYPES))
results.append(result)
subgraph_results.append((None, result))
if 'result' in r:
results.append(r['result'])
else:
if isinstance(r, ExecutionBlocker):
r = tuple([r] * len(obj.RETURN_TYPES))
results.append(r)
subgraph_results.append((None, r))
if has_subgraph:
output = subgraph_results
elif len(results) > 0:
output = merge_result_data(results, obj)
else:
output = []
output = []
if len(results) > 0:
# check which outputs need concatenating
output_is_list = [False] * len(results[0])
if hasattr(obj, "OUTPUT_IS_LIST"):
output_is_list = obj.OUTPUT_IS_LIST
# merge node execution results
for i, is_list in zip(range(len(results[0])), output_is_list):
if is_list:
output.append([x for o in results for x in o[i]])
else:
output.append([o[i] for o in results])
ui = dict()
if len(uis) > 0:
ui = {k: [y for x in uis for y in x[k]] for k in uis[0].keys()}
return output, ui, has_subgraph
return output, ui
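get_output_data accepts more than a plain tuple from a node's FUNCTION: a dict with a 'ui' key is pushed to the client in the "executed" message, an optional 'result' key feeds downstream nodes, and (in the newer code path) an 'expand' key triggers graph expansion. A sketch of the first two shapes as a node author would return them (the field name inside 'ui' is illustrative):
class PlainNode:
    RETURN_TYPES = ("INT",)
    FUNCTION = "run"

    def run(self, x):
        return (x + 1,)   # plain tuple of outputs

class PreviewNode:
    RETURN_TYPES = ()
    FUNCTION = "run"
    OUTPUT_NODE = True

    def run(self, text):
        # 'ui' is forwarded to the frontend with the "executed" message;
        # 'result' (optional) feeds downstream nodes as usual.
        return {"ui": {"text": [text]}, "result": ()}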
def format_value(x):
if x is None:
@@ -235,145 +117,53 @@ def format_value(x):
else:
return str(x)
def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results):
def recursive_execute(server, prompt, outputs, current_item, extra_data, executed, prompt_id, outputs_ui, object_storage):
unique_id = current_item
real_node_id = dynprompt.get_real_node_id(unique_id)
display_node_id = dynprompt.get_display_node_id(unique_id)
parent_node_id = dynprompt.get_parent_node_id(unique_id)
inputs = dynprompt.get_node(unique_id)['inputs']
class_type = dynprompt.get_node(unique_id)['class_type']
inputs = prompt[unique_id]['inputs']
class_type = prompt[unique_id]['class_type']
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
if caches.outputs.get(unique_id) is not None:
if server.client_id is not None:
cached_output = caches.ui.get(unique_id) or {}
server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id)
return (ExecutionResult.SUCCESS, None, None)
if unique_id in outputs:
return (True, None, None)
for x in inputs:
input_data = inputs[x]
if isinstance(input_data, list):
input_unique_id = input_data[0]
output_index = input_data[1]
if input_unique_id not in outputs:
result = recursive_execute(server, prompt, outputs, input_unique_id, extra_data, executed, prompt_id, outputs_ui, object_storage)
if result[0] is not True:
# Another node failed further upstream
return result
input_data_all = None
try:
if unique_id in pending_subgraph_results:
cached_results = pending_subgraph_results[unique_id]
resolved_outputs = []
for is_subgraph, result in cached_results:
if not is_subgraph:
resolved_outputs.append(result)
else:
resolved_output = []
for r in result:
if is_link(r):
source_node, source_output = r[0], r[1]
node_output = caches.outputs.get(source_node)[source_output]
for o in node_output:
resolved_output.append(o)
input_data_all = get_input_data(inputs, class_def, unique_id, outputs, prompt, extra_data)
if server.client_id is not None:
server.last_node_id = unique_id
server.send_sync("executing", { "node": unique_id, "prompt_id": prompt_id }, server.client_id)
else:
resolved_output.append(r)
resolved_outputs.append(tuple(resolved_output))
output_data = merge_result_data(resolved_outputs, class_def)
output_ui = []
has_subgraph = False
else:
input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data)
if server.client_id is not None:
server.last_node_id = display_node_id
server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id)
obj = object_storage.get((unique_id, class_type), None)
if obj is None:
obj = class_def()
object_storage[(unique_id, class_type)] = obj
obj = caches.objects.get(unique_id)
if obj is None:
obj = class_def()
caches.objects.set(unique_id, obj)
if hasattr(obj, "check_lazy_status"):
required_inputs = _map_node_over_list(obj, input_data_all, "check_lazy_status", allow_interrupt=True)
required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], []))
required_inputs = [x for x in required_inputs if isinstance(x,str) and (
x not in input_data_all or x in missing_keys
)]
if len(required_inputs) > 0:
for i in required_inputs:
execution_list.make_input_strong_link(unique_id, i)
return (ExecutionResult.PENDING, None, None)
def execution_block_cb(block):
if block.message is not None:
mes = {
"prompt_id": prompt_id,
"node_id": unique_id,
"node_type": class_type,
"executed": list(executed),
"exception_message": f"Execution Blocked: {block.message}",
"exception_type": "ExecutionBlocked",
"traceback": [],
"current_inputs": [],
"current_outputs": [],
}
server.send_sync("execution_error", mes, server.client_id)
return ExecutionBlocker(None)
else:
return block
def pre_execute_cb(call_index):
GraphBuilder.set_default_prefix(unique_id, call_index, 0)
output_data, output_ui, has_subgraph = get_output_data(obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb)
output_data, output_ui = get_output_data(obj, input_data_all)
outputs[unique_id] = output_data
if len(output_ui) > 0:
caches.ui.set(unique_id, {
"meta": {
"node_id": unique_id,
"display_node": display_node_id,
"parent_node": parent_node_id,
"real_node_id": real_node_id,
},
"output": output_ui
})
outputs_ui[unique_id] = output_ui
if server.client_id is not None:
server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id)
if has_subgraph:
cached_outputs = []
new_node_ids = []
new_output_ids = []
new_output_links = []
for i in range(len(output_data)):
new_graph, node_outputs = output_data[i]
if new_graph is None:
cached_outputs.append((False, node_outputs))
else:
# Check for conflicts
for node_id in new_graph.keys():
if dynprompt.has_node(node_id):
raise DuplicateNodeError(f"Attempt to add duplicate node {node_id}. Ensure node ids are unique and deterministic or use graph_utils.GraphBuilder.")
for node_id, node_info in new_graph.items():
new_node_ids.append(node_id)
display_id = node_info.get("override_display_id", unique_id)
dynprompt.add_ephemeral_node(node_id, node_info, unique_id, display_id)
# Figure out if the newly created node is an output node
class_type = node_info["class_type"]
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
if hasattr(class_def, 'OUTPUT_NODE') and class_def.OUTPUT_NODE == True:
new_output_ids.append(node_id)
for i in range(len(node_outputs)):
if is_link(node_outputs[i]):
from_node_id, from_socket = node_outputs[i][0], node_outputs[i][1]
new_output_links.append((from_node_id, from_socket))
cached_outputs.append((True, node_outputs))
new_node_ids = set(new_node_ids)
for cache in caches.all:
cache.ensure_subcache_for(unique_id, new_node_ids).clean_unused()
for node_id in new_output_ids:
execution_list.add_node(node_id)
for link in new_output_links:
execution_list.add_strong_link(link[0], link[1], unique_id)
pending_subgraph_results[unique_id] = cached_outputs
return (ExecutionResult.PENDING, None, None)
caches.outputs.set(unique_id, output_data)
server.send_sync("executed", { "node": unique_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id)
except comfy.model_management.InterruptProcessingException as iex:
logging.info("Processing interrupted")
# skip formatting inputs/outputs
error_details = {
"node_id": real_node_id,
"node_id": unique_id,
}
return (ExecutionResult.FAILURE, error_details, iex)
return (False, error_details, iex)
except Exception as ex:
typ, _, tb = sys.exc_info()
exception_type = full_type_name(typ)
@@ -383,36 +173,116 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
for name, inputs in input_data_all.items():
input_data_formatted[name] = [format_value(x) for x in inputs]
logging.error(f"!!! Exception during processing !!! {ex}")
output_data_formatted = {}
for node_id, node_outputs in outputs.items():
output_data_formatted[node_id] = [[format_value(x) for x in l] for l in node_outputs]
logging.error(f"!!! Exception during processing!!! {ex}")
logging.error(traceback.format_exc())
error_details = {
"node_id": real_node_id,
"node_id": unique_id,
"exception_message": str(ex),
"exception_type": exception_type,
"traceback": traceback.format_tb(tb),
"current_inputs": input_data_formatted
"current_inputs": input_data_formatted,
"current_outputs": output_data_formatted
}
if isinstance(ex, comfy.model_management.OOM_EXCEPTION):
logging.error("Got an OOM, unloading all loaded models.")
comfy.model_management.unload_all_models()
return (ExecutionResult.FAILURE, error_details, ex)
return (False, error_details, ex)
executed.add(unique_id)
return (ExecutionResult.SUCCESS, None, None)
return (True, None, None)
def recursive_will_execute(prompt, outputs, current_item, memo={}):
unique_id = current_item
if unique_id in memo:
return memo[unique_id]
inputs = prompt[unique_id]['inputs']
will_execute = []
if unique_id in outputs:
return []
for x in inputs:
input_data = inputs[x]
if isinstance(input_data, list):
input_unique_id = input_data[0]
output_index = input_data[1]
if input_unique_id not in outputs:
will_execute += recursive_will_execute(prompt, outputs, input_unique_id, memo)
memo[unique_id] = will_execute + [unique_id]
return memo[unique_id]
def recursive_output_delete_if_changed(prompt, old_prompt, outputs, current_item):
unique_id = current_item
inputs = prompt[unique_id]['inputs']
class_type = prompt[unique_id]['class_type']
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
is_changed_old = ''
is_changed = ''
to_delete = False
if hasattr(class_def, 'IS_CHANGED'):
if unique_id in old_prompt and 'is_changed' in old_prompt[unique_id]:
is_changed_old = old_prompt[unique_id]['is_changed']
if 'is_changed' not in prompt[unique_id]:
input_data_all = get_input_data(inputs, class_def, unique_id, outputs)
if input_data_all is not None:
try:
#is_changed = class_def.IS_CHANGED(**input_data_all)
is_changed = map_node_over_list(class_def, input_data_all, "IS_CHANGED")
prompt[unique_id]['is_changed'] = is_changed
except:
to_delete = True
else:
is_changed = prompt[unique_id]['is_changed']
if unique_id not in outputs:
return True
if not to_delete:
if is_changed != is_changed_old:
to_delete = True
elif unique_id not in old_prompt:
to_delete = True
elif class_type != old_prompt[unique_id]['class_type']:
to_delete = True
elif inputs == old_prompt[unique_id]['inputs']:
for x in inputs:
input_data = inputs[x]
if isinstance(input_data, list):
input_unique_id = input_data[0]
output_index = input_data[1]
if input_unique_id in outputs:
to_delete = recursive_output_delete_if_changed(prompt, old_prompt, outputs, input_unique_id)
else:
to_delete = True
if to_delete:
break
else:
to_delete = True
if to_delete:
d = outputs.pop(unique_id)
del d
return to_delete
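Both the IsChangedCache above and the older recursive_output_delete_if_changed invalidate a node's cached output whenever the value returned by its optional IS_CHANGED classmethod differs from the previous run. An illustrative node (names not from the repository) that re-runs when a file on disk changes:
import os

class LoadTextFile:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"path": ("STRING", {"default": ""})}}

    RETURN_TYPES = ("STRING",)
    FUNCTION = "load"
    CATEGORY = "Example"

    @classmethod
    def IS_CHANGED(s, path):
        # A new mtime yields a new value, which invalidates the cached output.
        try:
            return os.path.getmtime(path)
        except OSError:
            return float("NaN")   # NaN never compares equal, so the node always re-runs

    def load(self, path):
        with open(path, "r", encoding="utf-8") as f:
            return (f.read(),)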
class PromptExecutor:
def __init__(self, server, lru_size=None):
self.lru_size = lru_size
def __init__(self, server):
self.server = server
self.reset()
def reset(self):
self.caches = CacheSet(self.lru_size)
self.outputs = {}
self.object_storage = {}
self.outputs_ui = {}
self.status_messages = []
self.success = True
self.old_prompt = {}
def add_message(self, event, data: dict, broadcast: bool):
data = {
@@ -443,13 +313,26 @@ class PromptExecutor:
"node_id": node_id,
"node_type": class_type,
"executed": list(executed),
"exception_message": error["exception_message"],
"exception_type": error["exception_type"],
"traceback": error["traceback"],
"current_inputs": error["current_inputs"],
"current_outputs": list(current_outputs),
"current_outputs": error["current_outputs"],
}
self.add_message("execution_error", mes, broadcast=False)
# Next, remove the subsequent outputs since they will not be executed
to_delete = []
for o in self.outputs:
if (o not in current_outputs) and (o not in executed):
to_delete += [o]
if o in self.old_prompt:
d = self.old_prompt.pop(o)
del d
for o in to_delete:
d = self.outputs.pop(o)
del d
def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]):
nodes.interrupt_processing(False)
@@ -463,59 +346,65 @@ class PromptExecutor:
self.add_message("execution_start", { "prompt_id": prompt_id}, broadcast=False)
with torch.inference_mode():
dynamic_prompt = DynamicPrompt(prompt)
is_changed_cache = IsChangedCache(dynamic_prompt, self.caches.outputs)
for cache in self.caches.all:
cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache)
cache.clean_unused()
#delete cached outputs if nodes don't exist for them
to_delete = []
for o in self.outputs:
if o not in prompt:
to_delete += [o]
for o in to_delete:
d = self.outputs.pop(o)
del d
to_delete = []
for o in self.object_storage:
if o[0] not in prompt:
to_delete += [o]
else:
p = prompt[o[0]]
if o[1] != p['class_type']:
to_delete += [o]
for o in to_delete:
d = self.object_storage.pop(o)
del d
cached_nodes = []
for node_id in prompt:
if self.caches.outputs.get(node_id) is not None:
cached_nodes.append(node_id)
for x in prompt:
recursive_output_delete_if_changed(prompt, self.old_prompt, self.outputs, x)
current_outputs = set(self.outputs.keys())
for x in list(self.outputs_ui.keys()):
if x not in current_outputs:
d = self.outputs_ui.pop(x)
del d
comfy.model_management.cleanup_models(keep_clone_weights_loaded=True)
self.add_message("execution_cached",
{ "nodes": cached_nodes, "prompt_id": prompt_id},
{ "nodes": list(current_outputs) , "prompt_id": prompt_id},
broadcast=False)
pending_subgraph_results = {}
executed = set()
execution_list = ExecutionList(dynamic_prompt, self.caches.outputs)
current_outputs = self.caches.outputs.all_node_ids()
output_node_id = None
to_execute = []
for node_id in list(execute_outputs):
execution_list.add_node(node_id)
to_execute += [(0, node_id)]
while not execution_list.is_empty():
node_id, error, ex = execution_list.stage_node_execution()
if error is not None:
self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex)
break
while len(to_execute) > 0:
# always execute the output that depends on the fewest unexecuted nodes first
memo = {}
to_execute = sorted(list(map(lambda a: (len(recursive_will_execute(prompt, self.outputs, a[-1], memo)), a[-1]), to_execute)))
output_node_id = to_execute.pop(0)[-1]
result, error, ex = execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results)
self.success = result != ExecutionResult.FAILURE
if result == ExecutionResult.FAILURE:
self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex)
# This call shouldn't raise anything if there's an error deep in
# the actual SD code, instead it will report the node where the
# error was raised
self.success, error, ex = recursive_execute(self.server, prompt, self.outputs, output_node_id, extra_data, executed, prompt_id, self.outputs_ui, self.object_storage)
if self.success is not True:
self.handle_execution_error(prompt_id, prompt, current_outputs, executed, error, ex)
break
elif result == ExecutionResult.PENDING:
execution_list.unstage_node_execution()
else: # result == ExecutionResult.SUCCESS:
execution_list.complete_node_execution()
else:
# Only execute when the while-loop ends without break
self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False)
ui_outputs = {}
meta_outputs = {}
all_node_ids = self.caches.ui.all_node_ids()
for node_id in all_node_ids:
ui_info = self.caches.ui.get(node_id)
if ui_info is not None:
ui_outputs[node_id] = ui_info["output"]
meta_outputs[node_id] = ui_info["meta"]
self.history_result = {
"outputs": ui_outputs,
"meta": meta_outputs,
}
for x in executed:
self.old_prompt[x] = copy.deepcopy(prompt[x])
self.server.last_node_id = None
if comfy.model_management.DISABLE_SMART_MEMORY:
comfy.model_management.unload_all_models()
@@ -532,37 +421,31 @@ def validate_inputs(prompt, item, validated):
obj_class = nodes.NODE_CLASS_MAPPINGS[class_type]
class_inputs = obj_class.INPUT_TYPES()
valid_inputs = set(class_inputs.get('required',{})).union(set(class_inputs.get('optional',{})))
required_inputs = class_inputs['required']
errors = []
valid = True
validate_function_inputs = []
validate_has_kwargs = False
if hasattr(obj_class, "VALIDATE_INPUTS"):
argspec = inspect.getfullargspec(obj_class.VALIDATE_INPUTS)
validate_function_inputs = argspec.args
validate_has_kwargs = argspec.varkw is not None
received_types = {}
validate_function_inputs = inspect.getfullargspec(obj_class.VALIDATE_INPUTS).args
for x in valid_inputs:
type_input, input_category, extra_info = get_input_info(obj_class, x)
assert extra_info is not None
for x in required_inputs:
if x not in inputs:
if input_category == "required":
error = {
"type": "required_input_missing",
"message": "Required input is missing",
"details": f"{x}",
"extra_info": {
"input_name": x
}
error = {
"type": "required_input_missing",
"message": "Required input is missing",
"details": f"{x}",
"extra_info": {
"input_name": x
}
errors.append(error)
}
errors.append(error)
continue
val = inputs[x]
info = (type_input, extra_info)
info = required_inputs[x]
type_input = info[0]
if isinstance(val, list):
if len(val) != 2:
error = {
@@ -581,9 +464,8 @@ def validate_inputs(prompt, item, validated):
o_id = val[0]
o_class_type = prompt[o_id]['class_type']
r = nodes.NODE_CLASS_MAPPINGS[o_class_type].RETURN_TYPES
received_type = r[val[1]]
received_types[x] = received_type
if 'input_types' not in validate_function_inputs and received_type != type_input:
if r[val[1]] != type_input:
received_type = r[val[1]]
details = f"{x}, {received_type} != {type_input}"
error = {
"type": "return_type_mismatch",
@@ -634,9 +516,6 @@ def validate_inputs(prompt, item, validated):
if type_input == "STRING":
val = str(val)
inputs[x] = val
if type_input == "BOOLEAN":
val = bool(val)
inputs[x] = val
except Exception as ex:
error = {
"type": "invalid_input_type",
@@ -652,11 +531,11 @@ def validate_inputs(prompt, item, validated):
errors.append(error)
continue
if x not in validate_function_inputs and not validate_has_kwargs:
if "min" in extra_info and val < extra_info["min"]:
if len(info) > 1:
if "min" in info[1] and val < info[1]["min"]:
error = {
"type": "value_smaller_than_min",
"message": "Value {} smaller than min of {}".format(val, extra_info["min"]),
"message": "Value {} smaller than min of {}".format(val, info[1]["min"]),
"details": f"{x}",
"extra_info": {
"input_name": x,
@@ -666,10 +545,10 @@ def validate_inputs(prompt, item, validated):
}
errors.append(error)
continue
if "max" in extra_info and val > extra_info["max"]:
if "max" in info[1] and val > info[1]["max"]:
error = {
"type": "value_bigger_than_max",
"message": "Value {} bigger than max of {}".format(val, extra_info["max"]),
"message": "Value {} bigger than max of {}".format(val, info[1]["max"]),
"details": f"{x}",
"extra_info": {
"input_name": x,
@@ -680,6 +559,7 @@ def validate_inputs(prompt, item, validated):
errors.append(error)
continue
if x not in validate_function_inputs:
if isinstance(type_input, list):
if val not in type_input:
input_config = info
@@ -706,20 +586,18 @@ def validate_inputs(prompt, item, validated):
errors.append(error)
continue
if len(validate_function_inputs) > 0 or validate_has_kwargs:
input_data_all, _ = get_input_data(inputs, obj_class, unique_id)
if len(validate_function_inputs) > 0:
input_data_all = get_input_data(inputs, obj_class, unique_id)
input_filtered = {}
for x in input_data_all:
if x in validate_function_inputs or validate_has_kwargs:
if x in validate_function_inputs:
input_filtered[x] = input_data_all[x]
if 'input_types' in validate_function_inputs:
input_filtered['input_types'] = [received_types]
#ret = obj_class.VALIDATE_INPUTS(**input_filtered)
ret = _map_node_over_list(obj_class, input_filtered, "VALIDATE_INPUTS")
ret = map_node_over_list(obj_class, input_filtered, "VALIDATE_INPUTS")
for x in input_filtered:
for i, r in enumerate(ret):
if r is not True and not isinstance(r, ExecutionBlocker):
if r is not True:
details = f"{x}"
if r is not False:
details += f" - {str(r)}"
@@ -730,6 +608,8 @@ def validate_inputs(prompt, item, validated):
"details": details,
"extra_info": {
"input_name": x,
"input_config": info,
"received_value": val,
}
}
errors.append(error)
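validate_inputs only hands a node's optional VALIDATE_INPUTS classmethod the arguments it explicitly names (plus **kwargs and the special input_types parameter in the newer code), and treats any return value other than True as the error detail. An illustrative node-side counterpart (names not from the repository):
class CropArea:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "width": ("INT", {"default": 512, "min": 0, "max": 8192}),
            "height": ("INT", {"default": 512, "min": 0, "max": 8192}),
        }}

    RETURN_TYPES = ("INT",)
    FUNCTION = "area"
    CATEGORY = "Example"

    @classmethod
    def VALIDATE_INPUTS(s, width, height):
        # Return True when valid; any other value becomes the error detail.
        if width * height > 4096 * 4096:
            return f"{width}x{height} exceeds the supported area"
        return True

    def area(self, width, height):
        return (width * height,)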
@@ -895,7 +775,7 @@ class PromptQueue:
completed: bool
messages: List[str]
def task_done(self, item_id, history_result,
def task_done(self, item_id, outputs,
status: Optional['PromptQueue.ExecutionStatus']):
with self.mutex:
prompt = self.currently_running.pop(item_id)
@@ -908,10 +788,9 @@ class PromptQueue:
self.history[prompt[1]] = {
"prompt": prompt,
"outputs": {},
"outputs": copy.deepcopy(outputs),
'status': status_dict,
}
self.history[prompt[1]].update(history_result)
self.server.queue_updated()
def get_current_queue(self):

View File

@@ -1,13 +1,13 @@
from __future__ import annotations
import os
import time
import logging
from collections.abc import Collection
from typing import Set, List, Dict, Tuple
supported_pt_extensions: set[str] = {'.ckpt', '.pt', '.bin', '.pth', '.safetensors', '.pkl', '.sft'}
supported_pt_extensions: Set[str] = set(['.ckpt', '.pt', '.bin', '.pth', '.safetensors', '.pkl'])
folder_names_and_paths: dict[str, tuple[list[str], set[str]]] = {}
SupportedFileExtensionsType = Set[str]
ScanPathType = List[str]
folder_names_and_paths: Dict[str, Tuple[ScanPathType, SupportedFileExtensionsType]] = {}
base_path = os.path.dirname(os.path.realpath(__file__))
models_dir = os.path.join(base_path, "models")
@@ -17,7 +17,7 @@ folder_names_and_paths["configs"] = ([os.path.join(models_dir, "configs")], [".y
folder_names_and_paths["loras"] = ([os.path.join(models_dir, "loras")], supported_pt_extensions)
folder_names_and_paths["vae"] = ([os.path.join(models_dir, "vae")], supported_pt_extensions)
folder_names_and_paths["clip"] = ([os.path.join(models_dir, "clip")], supported_pt_extensions)
folder_names_and_paths["diffusion_models"] = ([os.path.join(models_dir, "unet"), os.path.join(models_dir, "diffusion_models")], supported_pt_extensions)
folder_names_and_paths["unet"] = ([os.path.join(models_dir, "unet")], supported_pt_extensions)
folder_names_and_paths["clip_vision"] = ([os.path.join(models_dir, "clip_vision")], supported_pt_extensions)
folder_names_and_paths["style_models"] = ([os.path.join(models_dir, "style_models")], supported_pt_extensions)
folder_names_and_paths["embeddings"] = ([os.path.join(models_dir, "embeddings")], supported_pt_extensions)
@@ -42,11 +42,7 @@ temp_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "temp
input_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "input")
user_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "user")
filename_list_cache: dict[str, tuple[list[str], dict[str, float], float]] = {}
def map_legacy(folder_name: str) -> str:
legacy = {"unet": "diffusion_models"}
return legacy.get(folder_name, folder_name)
filename_list_cache = {}
if not os.path.exists(input_directory):
try:
@@ -54,33 +50,33 @@ if not os.path.exists(input_directory):
except:
logging.error("Failed to create input directory")
def set_output_directory(output_dir: str) -> None:
def set_output_directory(output_dir):
global output_directory
output_directory = output_dir
def set_temp_directory(temp_dir: str) -> None:
def set_temp_directory(temp_dir):
global temp_directory
temp_directory = temp_dir
def set_input_directory(input_dir: str) -> None:
def set_input_directory(input_dir):
global input_directory
input_directory = input_dir
def get_output_directory() -> str:
def get_output_directory():
global output_directory
return output_directory
def get_temp_directory() -> str:
def get_temp_directory():
global temp_directory
return temp_directory
def get_input_directory() -> str:
def get_input_directory():
global input_directory
return input_directory
#NOTE: used in http server so don't put folders that should not be accessed remotely
def get_directory_by_type(type_name: str) -> str | None:
def get_directory_by_type(type_name):
if type_name == "output":
return get_output_directory()
if type_name == "temp":
@@ -92,7 +88,7 @@ def get_directory_by_type(type_name: str) -> str | None:
# determine base_dir from the annotation when name is in 'filename.ext [annotation]' format
# otherwise use default_dir as base_dir
def annotated_filepath(name: str) -> tuple[str, str | None]:
def annotated_filepath(name):
if name.endswith("[output]"):
base_dir = get_output_directory()
name = name[:-9]
@@ -108,7 +104,7 @@ def annotated_filepath(name: str) -> tuple[str, str | None]:
return name, base_dir
def get_annotated_filepath(name: str, default_dir: str | None=None) -> str:
def get_annotated_filepath(name, default_dir=None):
name, base_dir = annotated_filepath(name)
if base_dir is None:
@@ -120,7 +116,7 @@ def get_annotated_filepath(name: str, default_dir: str | None=None) -> str:
return os.path.join(base_dir, name)
def exists_annotated_filepath(name) -> bool:
def exists_annotated_filepath(name):
name, base_dir = annotated_filepath(name)
if base_dir is None:
@@ -130,19 +126,17 @@ def exists_annotated_filepath(name) -> bool:
return os.path.exists(filepath)
def add_model_folder_path(folder_name: str, full_folder_path: str) -> None:
def add_model_folder_path(folder_name, full_folder_path):
global folder_names_and_paths
folder_name = map_legacy(folder_name)
if folder_name in folder_names_and_paths:
folder_names_and_paths[folder_name][0].append(full_folder_path)
else:
folder_names_and_paths[folder_name] = ([full_folder_path], set())
def get_folder_paths(folder_name: str) -> list[str]:
folder_name = map_legacy(folder_name)
def get_folder_paths(folder_name):
return folder_names_and_paths[folder_name][0][:]
def recursive_search(directory: str, excluded_dir_names: list[str] | None=None) -> tuple[list[str], dict[str, float]]:
def recursive_search(directory, excluded_dir_names=None):
if not os.path.isdir(directory):
return [], {}
@@ -159,10 +153,6 @@ def recursive_search(directory: str, excluded_dir_names: list[str] | None=None)
logging.warning(f"Warning: Unable to access {directory}. Skipping this path.")
logging.debug("recursive file list on directory {}".format(directory))
dirpath: str
subdirs: list[str]
filenames: list[str]
for dirpath, subdirs, filenames in os.walk(directory, followlinks=True, topdown=True):
subdirs[:] = [d for d in subdirs if d not in excluded_dir_names]
for file_name in filenames:
@@ -170,7 +160,7 @@ def recursive_search(directory: str, excluded_dir_names: list[str] | None=None)
result.append(relative_path)
for d in subdirs:
path: str = os.path.join(dirpath, d)
path = os.path.join(dirpath, d)
try:
dirs[path] = os.path.getmtime(path)
except FileNotFoundError:
@@ -179,14 +169,13 @@ def recursive_search(directory: str, excluded_dir_names: list[str] | None=None)
logging.debug("found {} files".format(len(result)))
return result, dirs
def filter_files_extensions(files: Collection[str], extensions: Collection[str]) -> list[str]:
def filter_files_extensions(files, extensions):
return sorted(list(filter(lambda a: os.path.splitext(a)[-1].lower() in extensions or len(extensions) == 0, files)))
def get_full_path(folder_name: str, filename: str) -> str | None:
def get_full_path(folder_name, filename):
global folder_names_and_paths
folder_name = map_legacy(folder_name)
if folder_name not in folder_names_and_paths:
return None
folders = folder_names_and_paths[folder_name]
@@ -200,8 +189,7 @@ def get_full_path(folder_name: str, filename: str) -> str | None:
return None
def get_filename_list_(folder_name: str) -> tuple[list[str], dict[str, float], float]:
folder_name = map_legacy(folder_name)
def get_filename_list_(folder_name):
global folder_names_and_paths
output_list = set()
folders = folder_names_and_paths[folder_name]
@@ -211,12 +199,11 @@ def get_filename_list_(folder_name: str) -> tuple[list[str], dict[str, float], f
output_list.update(filter_files_extensions(files, folders[1]))
output_folders = {**output_folders, **folders_all}
return sorted(list(output_list)), output_folders, time.perf_counter()
return (sorted(list(output_list)), output_folders, time.perf_counter())
def cached_filename_list_(folder_name: str) -> tuple[list[str], dict[str, float], float] | None:
def cached_filename_list_(folder_name):
global filename_list_cache
global folder_names_and_paths
folder_name = map_legacy(folder_name)
if folder_name not in filename_list_cache:
return None
out = filename_list_cache[folder_name]
@@ -235,8 +222,7 @@ def cached_filename_list_(folder_name: str) -> tuple[list[str], dict[str, float]
return out
def get_filename_list(folder_name: str) -> list[str]:
folder_name = map_legacy(folder_name)
def get_filename_list(folder_name):
out = cached_filename_list_(folder_name)
if out is None:
out = get_filename_list_(folder_name)
@@ -244,17 +230,17 @@ def get_filename_list(folder_name: str) -> list[str]:
filename_list_cache[folder_name] = out
return list(out[0])
def get_save_image_path(filename_prefix: str, output_dir: str, image_width=0, image_height=0) -> tuple[str, str, int, str, str]:
def map_filename(filename: str) -> tuple[int, str]:
def get_save_image_path(filename_prefix, output_dir, image_width=0, image_height=0):
def map_filename(filename):
prefix_len = len(os.path.basename(filename_prefix))
prefix = filename[:prefix_len + 1]
try:
digits = int(filename[prefix_len + 1:].split('_')[0])
except:
digits = 0
return digits, prefix
return (digits, prefix)
def compute_vars(input: str, image_width: int, image_height: int) -> str:
def compute_vars(input, image_width, image_height):
input = input.replace("%width%", str(image_width))
input = input.replace("%height%", str(image_height))
return input
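The helpers above resolve the '[annotation]' suffix to a base directory and, on the side of the diff that defines map_legacy, transparently map the legacy "unet" folder name onto "diffusion_models". A short usage sketch (file names are placeholders; results depend on the local install):
import folder_paths

# "[output]" routes the lookup to the output directory; the suffix
# (including the leading space) is stripped before the path is joined.
path = folder_paths.get_annotated_filepath("picture.png [output]")

# The legacy "unet" folder name is resolved to "diffusion_models" via map_legacy.
unet_files = folder_paths.get_filename_list("unet")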

View File

@@ -101,7 +101,7 @@ def cuda_malloc_warning():
logging.warning("\nWARNING: this card most likely does not support cuda-malloc, if you get \"CUDA error\" please run ComfyUI with: --disable-cuda-malloc\n")
def prompt_worker(q, server):
e = execution.PromptExecutor(server, lru_size=args.cache_lru)
e = execution.PromptExecutor(server)
last_gc_collect = 0
need_gc = False
gc_collect_interval = 10.0
@@ -121,7 +121,7 @@ def prompt_worker(q, server):
e.execute(item[2], prompt_id, item[3], item[4])
need_gc = True
q.task_done(item_id,
e.history_result,
e.outputs_ui,
status=execution.PromptQueue.ExecutionStatus(
status_str='success' if e.success else 'error',
completed=e.success,
@@ -242,7 +242,6 @@ if __name__ == "__main__":
folder_paths.add_model_folder_path("checkpoints", os.path.join(folder_paths.get_output_directory(), "checkpoints"))
folder_paths.add_model_folder_path("clip", os.path.join(folder_paths.get_output_directory(), "clip"))
folder_paths.add_model_folder_path("vae", os.path.join(folder_paths.get_output_directory(), "vae"))
folder_paths.add_model_folder_path("diffusion_models", os.path.join(folder_paths.get_output_directory(), "diffusion_models"))
if args.input_directory:
input_dir = os.path.abspath(args.input_directory)
@@ -262,7 +261,6 @@ if __name__ == "__main__":
call_on_start = startup_server
try:
loop.run_until_complete(server.setup())
loop.run_until_complete(run(server, address=args.listen, port=args.port, verbose=not args.dont_print_server, call_on_start=call_on_start))
except KeyboardInterrupt:
logging.info("\nStopped server")

View File

@@ -1,2 +0,0 @@
# model_manager/__init__.py
from .download_models import download_model, DownloadModelStatus, DownloadStatusType, create_model_path, check_file_exists, track_download_progress, validate_model_subdirectory, validate_filename

View File

@@ -1,240 +0,0 @@
from __future__ import annotations
import aiohttp
import os
import traceback
import logging
from folder_paths import models_dir
import re
from typing import Callable, Any, Optional, Awaitable, Dict
from enum import Enum
import time
from dataclasses import dataclass
class DownloadStatusType(Enum):
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
ERROR = "error"
@dataclass
class DownloadModelStatus():
status: str
progress_percentage: float
message: str
already_existed: bool = False
def __init__(self, status: DownloadStatusType, progress_percentage: float, message: str, already_existed: bool):
self.status = status.value # Store the string value of the Enum
self.progress_percentage = progress_percentage
self.message = message
self.already_existed = already_existed
def to_dict(self) -> Dict[str, Any]:
return {
"status": self.status,
"progress_percentage": self.progress_percentage,
"message": self.message,
"already_existed": self.already_existed
}
async def download_model(model_download_request: Callable[[str], Awaitable[aiohttp.ClientResponse]],
model_name: str,
model_url: str,
model_sub_directory: str,
progress_callback: Callable[[str, DownloadModelStatus], Awaitable[Any]],
progress_interval: float = 1.0) -> DownloadModelStatus:
"""
Download a model file from a given URL into the models directory.
Args:
model_download_request (Callable[[str], Awaitable[aiohttp.ClientResponse]]):
A function that makes an HTTP request. This makes it easier to mock in unit tests.
model_name (str):
The name of the model file to be downloaded. This will be the filename on disk.
model_url (str):
The URL from which to download the model.
model_sub_directory (str):
The subdirectory within the main models directory where the model
should be saved (e.g., 'checkpoints', 'loras', etc.).
progress_callback (Callable[[str, DownloadModelStatus], Awaitable[Any]]):
An asynchronous function to call with progress updates.
Returns:
DownloadModelStatus: The result of the download operation.
"""
if not validate_model_subdirectory(model_sub_directory):
return DownloadModelStatus(
DownloadStatusType.ERROR,
0,
"Invalid model subdirectory",
False
)
if not validate_filename(model_name):
return DownloadModelStatus(
DownloadStatusType.ERROR,
0,
"Invalid model name",
False
)
file_path, relative_path = create_model_path(model_name, model_sub_directory, models_dir)
existing_file = await check_file_exists(file_path, model_name, progress_callback, relative_path)
if existing_file:
return existing_file
try:
status = DownloadModelStatus(DownloadStatusType.PENDING, 0, f"Starting download of {model_name}", False)
await progress_callback(relative_path, status)
response = await model_download_request(model_url)
if response.status != 200:
error_message = f"Failed to download {model_name}. Status code: {response.status}"
logging.error(error_message)
status = DownloadModelStatus(DownloadStatusType.ERROR, 0, error_message, False)
await progress_callback(relative_path, status)
return DownloadModelStatus(DownloadStatusType.ERROR, 0, error_message, False)
return await track_download_progress(response, file_path, model_name, progress_callback, relative_path, progress_interval)
except Exception as e:
logging.error(f"Error in downloading model: {e}")
return await handle_download_error(e, model_name, progress_callback, relative_path)
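Because the HTTP request is passed in as a callable, calling download_model from application code (or a test) only requires a session-bound getter and an async progress callback. A minimal usage sketch; the URL and file names are placeholders, and the import path is an assumption based on the __init__ shown earlier:
import asyncio
import aiohttp
# Import path is an assumption; the __init__ shown earlier re-exports these names.
from model_filemanager import download_model, DownloadModelStatus

async def report(relative_path: str, status: DownloadModelStatus):
    print(relative_path, status.to_dict())

async def main():
    async with aiohttp.ClientSession() as session:
        result = await download_model(
            lambda url: session.get(url),
            model_name="example.safetensors",
            model_url="https://example.com/example.safetensors",
            model_sub_directory="checkpoints",
            progress_callback=report,
        )
        print(result.to_dict())

asyncio.run(main())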
def create_model_path(model_name: str, model_directory: str, models_base_dir: str) -> tuple[str, str]:
full_model_dir = os.path.join(models_base_dir, model_directory)
os.makedirs(full_model_dir, exist_ok=True)
file_path = os.path.join(full_model_dir, model_name)
# Ensure the resulting path is still within the base directory
abs_file_path = os.path.abspath(file_path)
abs_base_dir = os.path.abspath(str(models_base_dir))
if os.path.commonprefix([abs_file_path, abs_base_dir]) != abs_base_dir:
raise Exception(f"Invalid model directory: {model_directory}/{model_name}")
relative_path = '/'.join([model_directory, model_name])
return file_path, relative_path
async def check_file_exists(file_path: str,
model_name: str,
progress_callback: Callable[[str, DownloadModelStatus], Awaitable[Any]],
relative_path: str) -> Optional[DownloadModelStatus]:
if os.path.exists(file_path):
status = DownloadModelStatus(DownloadStatusType.COMPLETED, 100, f"{model_name} already exists", True)
await progress_callback(relative_path, status)
return status
return None
async def track_download_progress(response: aiohttp.ClientResponse,
file_path: str,
model_name: str,
progress_callback: Callable[[str, DownloadModelStatus], Awaitable[Any]],
relative_path: str,
interval: float = 1.0) -> DownloadModelStatus:
try:
total_size = int(response.headers.get('Content-Length', 0))
downloaded = 0
last_update_time = time.time()
async def update_progress():
nonlocal last_update_time
progress = (downloaded / total_size) * 100 if total_size > 0 else 0
status = DownloadModelStatus(DownloadStatusType.IN_PROGRESS, progress, f"Downloading {model_name}", False)
await progress_callback(relative_path, status)
last_update_time = time.time()
with open(file_path, 'wb') as f:
chunk_iterator = response.content.iter_chunked(8192)
while True:
try:
chunk = await chunk_iterator.__anext__()
except StopAsyncIteration:
break
f.write(chunk)
downloaded += len(chunk)
if time.time() - last_update_time >= interval:
await update_progress()
await update_progress()
logging.info(f"Successfully downloaded {model_name}. Total downloaded: {downloaded}")
status = DownloadModelStatus(DownloadStatusType.COMPLETED, 100, f"Successfully downloaded {model_name}", False)
await progress_callback(relative_path, status)
return status
except Exception as e:
logging.error(f"Error in track_download_progress: {e}")
logging.error(traceback.format_exc())
return await handle_download_error(e, model_name, progress_callback, relative_path)
async def handle_download_error(e: Exception,
model_name: str,
progress_callback: Callable[[str, DownloadModelStatus], Any],
relative_path: str) -> DownloadModelStatus:
error_message = f"Error downloading {model_name}: {str(e)}"
status = DownloadModelStatus(DownloadStatusType.ERROR, 0, error_message, False)
await progress_callback(relative_path, status)
return status
def validate_model_subdirectory(model_subdirectory: str) -> bool:
"""
Validate that the model subdirectory is safe to install into.
Must not contain relative paths, nested paths or special characters
other than underscores and hyphens.
Args:
model_subdirectory (str): The subdirectory for the specific model type.
Returns:
bool: True if the subdirectory is safe, False otherwise.
"""
if len(model_subdirectory) > 50:
return False
if '..' in model_subdirectory or '/' in model_subdirectory:
return False
if not re.match(r'^[a-zA-Z0-9_-]+$', model_subdirectory):
return False
return True
def validate_filename(filename: str)-> bool:
"""
Validate a filename to ensure it's safe and doesn't contain any path traversal attempts.
Args:
filename (str): The filename to validate
Returns:
bool: True if the filename is valid, False otherwise
"""
if not filename.lower().endswith(('.sft', '.safetensors')):
return False
# Check if the filename is empty, None, or just whitespace
if not filename or not filename.strip():
return False
# Check for any directory traversal attempts or invalid characters
if any(char in filename for char in ['..', '/', '\\', '\n', '\r', '\t', '\0']):
return False
# Check if the filename starts with a dot (hidden file)
if filename.startswith('.'):
return False
# Use a whitelist of allowed characters
if not re.match(r'^[a-zA-Z0-9_\-. ]+$', filename):
return False
# Ensure the filename isn't too long
if len(filename) > 255:
return False
return True
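Taken together, the two validators above confine downloads to flat, whitelisted filenames inside a single models subdirectory. A few inputs and the outcomes the rules produce (using the two functions as defined above):
assert validate_model_subdirectory("checkpoints")
assert not validate_model_subdirectory("../checkpoints")     # relative path component
assert not validate_model_subdirectory("loras/nested")       # nested path

assert validate_filename("model.safetensors")
assert validate_filename("model.sft")
assert not validate_filename(".hidden.safetensors")          # hidden file
assert not validate_filename("..\\model.safetensors")        # traversal characters
assert not validate_filename("model.ckpt")                   # extension not on the whitelist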

Some files were not shown because too many files have changed in this diff.