Compare commits

1 commit

Author: Yoland Yan
SHA1: ef0c2b0819
Message: Add pre-commit configuration and update README for backend development
Date: 2025-03-02 12:43:19 -08:00

212 changed files with 1360 additions and 50334 deletions

View File

@@ -63,12 +63,7 @@ except:
 print("checking out master branch") # noqa: T201
 branch = repo.lookup_branch('master')
 if branch is None:
-    try:
-        ref = repo.lookup_reference('refs/remotes/origin/master')
-    except:
-        print("pulling.") # noqa: T201
-        pull(repo)
-        ref = repo.lookup_reference('refs/remotes/origin/master')
+    ref = repo.lookup_reference('refs/remotes/origin/master')
     repo.checkout(ref)
     branch = repo.lookup_branch('master')
     if branch is None:

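For context, a minimal pygit2 sketch of checking out master via the origin/master remote-tracking ref, as in the updater code above. This is illustrative only, not the repository's actual updater; `repo_path` is a placeholder and the pull-and-retry fallback is omitted.

```python
import pygit2


def checkout_master(repo_path: str) -> None:
    """Check out local master, falling back to origin/master when no local branch exists."""
    repo = pygit2.Repository(repo_path)
    branch = repo.lookup_branch("master")
    if branch is None:
        # No local master yet: resolve the remote-tracking ref and check it out.
        ref = repo.lookup_reference("refs/remotes/origin/master")
        repo.checkout(ref)
        branch = repo.lookup_branch("master")
        if branch is None:
            repo.create_branch("master", repo.get(ref.target))


if __name__ == "__main__":
    checkout_master("./ComfyUI")
```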
View File

@@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
-pause

View File

@@ -12,7 +12,7 @@ on:
         description: 'CUDA version'
         required: true
         type: string
-        default: "128"
+        default: "126"
       python_minor:
         description: 'Python minor version'
         required: true
@@ -22,7 +22,7 @@ on:
         description: 'Python patch version'
         required: true
         type: string
-        default: "10"
+        default: "8"
 jobs:
@@ -36,7 +36,7 @@ jobs:
       - uses: actions/checkout@v4
         with:
           ref: ${{ inputs.git_tag }}
-          fetch-depth: 150
+          fetch-depth: 0
           persist-credentials: false
       - uses: actions/cache/restore@v4
        id: cache
@@ -70,7 +70,7 @@ jobs:
          cd ..
          git clone --depth 1 https://github.com/comfyanonymous/taesd
-          cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
          mkdir ComfyUI_windows_portable
          mv python_embeded ComfyUI_windows_portable
@@ -85,14 +85,12 @@
          cd ..
-          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
          mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z
          cd ComfyUI_windows_portable
          python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu
-          python_embeded/python.exe -s ./update/update.py ComfyUI/
          ls
      - name: Upload binaries to release

View File

@@ -17,7 +17,7 @@ jobs:
          path: "ComfyUI"
      - uses: actions/setup-python@v4
        with:
-          python-version: '3.10'
+          python-version: '3.9'
      - name: Install requirements
        run: |
          python -m pip install --upgrade pip

View File

@@ -1,56 +0,0 @@
-name: Generate Pydantic Stubs from api.comfy.org
-on:
-  schedule:
-    - cron: '0 0 * * 1'
-  workflow_dispatch:
-jobs:
-  generate-models:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'datamodel-code-generator[http]'
-          npm install @redocly/cli
-      - name: Download OpenAPI spec
-        run: |
-          curl -o openapi.yaml https://api.comfy.org/openapi
-      - name: Filter OpenAPI spec with Redocly
-        run: |
-          npx @redocly/cli bundle openapi.yaml --output filtered-openapi.yaml --config comfy_api_nodes/redocly.yaml --remove-unused-components
-      - name: Generate API models
-        run: |
-          datamodel-codegen --use-subclass-enum --input filtered-openapi.yaml --output comfy_api_nodes/apis --output-model-type pydantic_v2.BaseModel
-      - name: Check for changes
-        id: git-check
-        run: |
-          git diff --exit-code comfy_api_nodes/apis || echo "changes=true" >> $GITHUB_OUTPUT
-      - name: Create Pull Request
-        if: steps.git-check.outputs.changes == 'true'
-        uses: peter-evans/create-pull-request@v5
-        with:
-          commit-message: 'chore: update API models from OpenAPI spec'
-          title: 'Update API models from api.comfy.org'
-          body: |
-            This PR updates the API models based on the latest api.comfy.org OpenAPI specification.
-            Generated automatically by the a Github workflow.
-          branch: update-api-stubs
-          delete-branch: true
-          base: master

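A rough local equivalent of the deleted workflow's generation steps, sketched in Python. The CLI invocations are taken verbatim from the workflow above; running this assumes `requests`, `datamodel-code-generator[http]` and the Redocly CLI are installed, which is an assumption outside the diff itself.

```python
import subprocess

import requests

# Download the OpenAPI spec, as the deleted workflow did with curl.
spec = requests.get("https://api.comfy.org/openapi", timeout=30)
spec.raise_for_status()
with open("openapi.yaml", "wb") as f:
    f.write(spec.content)

# Filter the spec, then generate Pydantic v2 models, using the same commands as the workflow.
subprocess.run(
    ["npx", "@redocly/cli", "bundle", "openapi.yaml",
     "--output", "filtered-openapi.yaml",
     "--config", "comfy_api_nodes/redocly.yaml",
     "--remove-unused-components"],
    check=True,
)
subprocess.run(
    ["datamodel-codegen", "--use-subclass-enum",
     "--input", "filtered-openapi.yaml",
     "--output", "comfy_api_nodes/apis",
     "--output-model-type", "pydantic_v2.BaseModel"],
    check=True,
)
```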
View File

@@ -17,7 +17,7 @@ on:
         description: 'cuda version'
         required: true
         type: string
-        default: "128"
+        default: "126"
       python_minor:
         description: 'python minor version'
@@ -29,7 +29,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "10"
+        default: "8"
 #  push:
 #    branches:
 #      - master

View File

@@ -7,7 +7,7 @@ on:
         description: 'cuda version'
         required: true
         type: string
-        default: "128"
+        default: "126"
       python_minor:
         description: 'python minor version'
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "2"
+        default: "1"
 #  push:
 #    branches:
 #      - master
@@ -34,7 +34,7 @@ jobs:
     steps:
      - uses: actions/checkout@v4
        with:
-          fetch-depth: 30
+          fetch-depth: 0
          persist-credentials: false
      - uses: actions/setup-python@v5
        with:
@@ -56,7 +56,7 @@ jobs:
          cd ..
          git clone --depth 1 https://github.com/comfyanonymous/taesd
-          cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
          mkdir ComfyUI_windows_portable_nightly_pytorch
          mv python_embeded ComfyUI_windows_portable_nightly_pytorch
@@ -74,7 +74,7 @@ jobs:
          pause" > ./update/update_comfyui_and_python_dependencies.bat
          cd ..
-          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
          mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z
          cd ComfyUI_windows_portable_nightly_pytorch

View File

@@ -7,7 +7,7 @@ on:
         description: 'cuda version'
         required: true
         type: string
-        default: "128"
+        default: "126"
       python_minor:
         description: 'python minor version'
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "10"
+        default: "8"
 #  push:
 #    branches:
 #      - master
@@ -50,7 +50,7 @@ jobs:
      - uses: actions/checkout@v4
        with:
-          fetch-depth: 150
+          fetch-depth: 0
          persist-credentials: false
      - shell: bash
        run: |
@@ -67,7 +67,7 @@ jobs:
          cd ..
          git clone --depth 1 https://github.com/comfyanonymous/taesd
-          cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/
          mkdir ComfyUI_windows_portable
          mv python_embeded ComfyUI_windows_portable
@@ -82,14 +82,12 @@
          cd ..
-          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
          mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z
          cd ComfyUI_windows_portable
          python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu
-          python_embeded/python.exe -s ./update/update.py ComfyUI/
          ls
      - name: Upload binaries to release

.gitignore (vendored): 3 changed lines

View File

@@ -21,6 +21,3 @@ venv/
 *.log
 web_custom_versions/
 .DS_Store
-openapi.yaml
-filtered-openapi.yaml
-uv.lock

.pre-commit-config.yaml (new file): 13 additions

View File

@@ -0,0 +1,13 @@
+repos:
+- repo: https://github.com/charliermarsh/ruff-pre-commit
+  rev: v0.0.241  # Use the desired version of Ruff
+  hooks:
+  - id: ruff
+- repo: local
+  hooks:
+  - id: pytest
+    name: Run Pytest
+    entry: pytest
+    language: system
+    types: [python]

View File

@@ -5,20 +5,19 @@
 # Inlined the team members for now.
 # Maintainers
-*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
+*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
 # Python web server
-/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne
+/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
-# Node developers
-/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
-/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
+# Extra nodes
+/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink

View File

@@ -49,6 +49,7 @@ Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon,
 ## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
 See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Image Models
@@ -61,7 +62,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
    - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
    - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
-   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
 - Video Models
    - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
    - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
@@ -69,11 +69,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
    - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
    - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
-- Audio Models
-   - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
-   - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
-- 3D Models
-   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
+- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -100,22 +96,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)
-## Release Process
-ComfyUI follows a weekly release cycle every Friday, with three interconnected repositories:
-1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new stable version (e.g., v0.7.0)
-   - Serves as the foundation for the desktop release
-2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
-   - Builds a new release using the latest stable core version
-3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
-   - Weekly frontend updates are merged into the core repository
-   - Features are frozen for the upcoming core release
-   - Development continues for the next release cycle
 ## Shortcuts
 | Keybind | Explanation |
@@ -166,6 +146,8 @@ Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you
 If you have trouble extracting it, right click the file -> properties -> unblock
+If you have a 50 series Blackwell card like a 5090 or 5080 see [this discussion thread](https://github.com/comfyanonymous/ComfyUI/discussions/6643)
 #### How do I share models between another UI and ComfyUI?
 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
@@ -197,11 +179,11 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3```
-This is the command to install the nightly with ROCm 6.4 which might have some performance improvements:
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4```
+This is the command to install the nightly with ROCm 6.3 which might have some performance improvements:
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3```
 ### Intel GPUs (Windows and Linux)
@@ -231,11 +213,11 @@ Additional discussion and help can be found [here](https://github.com/comfyanony
 Nvidia users should install stable pytorch using this command:
-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128```
-This is the command to install pytorch nightly instead which might have performance improvements.
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126```
+This is the command to install pytorch nightly instead which might have performance improvements:
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126```
 #### Troubleshooting
@@ -301,7 +283,7 @@ For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 pyt
 ### AMD ROCm Tips
-You can enable experimental memory efficient attention on recent pytorch in ComfyUI on some AMD GPUs using this command, it should already be enabled by default on RDNA3. If this improves speed for you on latest pytorch on your GPU please report it so that I can enable it by default.
+You can enable experimental memory efficient attention on pytorch 2.5 in ComfyUI on RDNA3 and potentially other AMD GPUs using this command:
 ```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```
@@ -348,6 +330,25 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
 See also: [https://www.comfy.org/](https://www.comfy.org/)
+## ComfyUI Backend Development
+### Setup Environment
+Install pre-commit to run tests and linters
+```
+pip install pre-commit
+```
+```
+pre-commit install
+```
+### Reporting Issues and Requesting Features
+For any bugs, issues, or feature requests related to the backend, please use the [ComfyUI repository](https://github.com/comfyanonymous/ComfyUI). This will help us manage and address backend-specific concerns more efficiently.
 ## Frontend Development
 As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.

View File

@@ -9,14 +9,8 @@ class AppSettings():
         self.user_manager = user_manager
     def get_settings(self, request):
-        try:
-            file = self.user_manager.get_request_user_filepath(
-                request,
-                "comfy.settings.json"
-            )
-        except KeyError as e:
-            logging.error("User settings not found.")
-            raise web.HTTPUnauthorized() from e
+        file = self.user_manager.get_request_user_filepath(
+            request, "comfy.settings.json")
         if os.path.isfile(file):
             try:
                 with open(file) as f:

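The wrapping shown on the left turns a missing or invalid user into an HTTP 401 instead of an unhandled KeyError. A minimal aiohttp sketch of that pattern, using a hypothetical `resolve_settings_path` lookup rather than ComfyUI's user manager:

```python
import logging

from aiohttp import web


def resolve_settings_path(user_id: str) -> str:
    # Hypothetical lookup that raises KeyError for unknown users.
    users = {"default": "/data/users/default/comfy.settings.json"}
    return users[user_id]


async def get_settings(request: web.Request) -> web.Response:
    try:
        path = resolve_settings_path(request.headers.get("comfy-user", "default"))
    except KeyError as e:
        logging.error("User settings not found.")
        # Surface the failure as 401 rather than a 500 from an uncaught KeyError.
        raise web.HTTPUnauthorized() from e
    return web.json_response({"settings_path": path})


app = web.Application()
app.router.add_get("/settings", get_settings)
```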
View File

@@ -93,20 +93,16 @@ class CustomNodeManager:
     def add_routes(self, routes, webapp, loadedModules):
-        example_workflow_folder_names = ["example_workflows", "example", "examples", "workflow", "workflows"]
         @routes.get("/workflow_templates")
         async def get_workflow_templates(request):
             """Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
-            files = []
-            for folder in folder_paths.get_folder_paths("custom_nodes"):
-                for folder_name in example_workflow_folder_names:
-                    pattern = os.path.join(folder, f"*/{folder_name}/*.json")
-                    matched_files = glob.glob(pattern)
-                    files.extend(matched_files)
+            files = [
+                file
+                for folder in folder_paths.get_folder_paths("custom_nodes")
+                for file in glob.glob(
+                    os.path.join(folder, "*/example_workflows/*.json")
+                )
+            ]
             workflow_templates_dict = (
                 {}
             )  # custom_nodes folder name -> example workflow names
@@ -122,22 +118,15 @@
         # Serve workflow templates from custom nodes.
         for module_name, module_dir in loadedModules:
-            for folder_name in example_workflow_folder_names:
-                workflows_dir = os.path.join(module_dir, folder_name)
-                if os.path.exists(workflows_dir):
-                    if folder_name != "example_workflows":
-                        logging.debug(
-                            "Found example workflow folder '%s' for custom node '%s', consider renaming it to 'example_workflows'",
-                            folder_name, module_name)
-                    webapp.add_routes(
-                        [
-                            web.static(
-                                "/api/workflow_templates/" + module_name, workflows_dir
-                            )
-                        ]
-                    )
+            workflows_dir = os.path.join(module_dir, "example_workflows")
+            if os.path.exists(workflows_dir):
+                webapp.add_routes(
+                    [
+                        web.static(
+                            "/api/workflow_templates/" + module_name, workflows_dir
+                        )
+                    ]
+                )
         @routes.get("/i18n")
         async def get_i18n(request):

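The left-hand side of this hunk scans several candidate folder names per custom node. A standalone sketch of that globbing logic, with a hypothetical `custom_nodes_root` argument standing in for ComfyUI's folder_paths helper:

```python
import glob
import os

# Folder names accepted for example workflows, mirroring the list in the hunk above.
CANDIDATE_FOLDERS = ["example_workflows", "example", "examples", "workflow", "workflows"]


def find_example_workflows(custom_nodes_root: str) -> list[str]:
    """Collect */<candidate>/*.json workflow files under every custom node folder."""
    files: list[str] = []
    for folder_name in CANDIDATE_FOLDERS:
        pattern = os.path.join(custom_nodes_root, f"*/{folder_name}/*.json")
        files.extend(glob.glob(pattern))
    return files


print(find_example_workflows("./custom_nodes"))
```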
View File

@@ -3,7 +3,6 @@ import argparse
 import logging
 import os
 import re
-import sys
 import tempfile
 import zipfile
 import importlib
@@ -11,61 +10,19 @@ from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import TypedDict, Optional
-from importlib.metadata import version
 import requests
 from typing_extensions import NotRequired
 from comfy.cli_args import DEFAULT_VERSION_STRING
-import app.logger
-# The path to the requirements.txt file
-req_path = Path(__file__).parents[1] / "requirements.txt"
-def frontend_install_warning_message():
-    """The warning message to display when the frontend version is not up to date."""
-    extra = ""
-    if sys.flags.no_user_site:
-        extra = "-s "
-    return f"""
-Please install the updated requirements.txt file by running:
-{sys.executable} {extra}-m pip install -r {req_path}
-This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
-If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem
-""".strip()
-def check_frontend_version():
-    """Check if the frontend version is up to date."""
-    def parse_version(version: str) -> tuple[int, int, int]:
-        return tuple(map(int, version.split(".")))
-    try:
-        frontend_version_str = version("comfyui-frontend-package")
-        frontend_version = parse_version(frontend_version_str)
-        with open(req_path, "r", encoding="utf-8") as f:
-            required_frontend = parse_version(f.readline().split("=")[-1])
-        if frontend_version < required_frontend:
-            app.logger.log_startup_warning(
-                f"""
-________________________________________________________________________
-WARNING WARNING WARNING WARNING WARNING
-Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
-{frontend_install_warning_message()}
-________________________________________________________________________
-""".strip()
-            )
-        else:
-            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
-    except Exception as e:
-        logging.error(f"Failed to check frontend version: {e}")
+try:
+    import comfyui_frontend_package
+except ImportError as e:
+    # TODO: Remove the check after roll out of 0.3.16
+    logging.error("comfyui-frontend-package is not installed. Please install the updated requirements.txt file by running: pip install -r requirements.txt")
+    raise e
 REQUEST_TIMEOUT = 10  # seconds
@@ -162,62 +119,9 @@
 class FrontendManager:
+    DEFAULT_FRONTEND_PATH = str(importlib.resources.files(comfyui_frontend_package) / "static")
     CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
-    @classmethod
-    def default_frontend_path(cls) -> str:
-        try:
-            import comfyui_frontend_package
-            return str(importlib.resources.files(comfyui_frontend_package) / "static")
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-comfyui-frontend-package is not installed.
-{frontend_install_warning_message()}
-********** ERROR ***********
-""".strip()
-            )
-            sys.exit(-1)
-    @classmethod
-    def templates_path(cls) -> str:
-        try:
-            import comfyui_workflow_templates
-            return str(
-                importlib.resources.files(comfyui_workflow_templates) / "templates"
-            )
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-comfyui-workflow-templates is not installed.
-{frontend_install_warning_message()}
-********** ERROR ***********
-""".strip()
-            )
-    @classmethod
-    def embedded_docs_path(cls) -> str:
-        """Get the path to embedded documentation"""
-        try:
-            import comfyui_embedded_docs
-            return str(
-                importlib.resources.files(comfyui_embedded_docs) / "docs"
-            )
-        except ImportError:
-            logging.info("comfyui-embedded-docs package not found")
-            return None
     @classmethod
     def parse_version_string(cls, value: str) -> tuple[str, str, str]:
         """
@@ -238,9 +142,7 @@ comfyui-workflow-templates is not installed.
         return match_result.group(1), match_result.group(2), match_result.group(3)
     @classmethod
-    def init_frontend_unsafe(
-        cls, version_string: str, provider: Optional[FrontEndProvider] = None
-    ) -> str:
+    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
         """
         Initializes the frontend for the specified version.
@@ -256,26 +158,17 @@
         main error source might be request timeout or invalid URL.
         """
         if version_string == DEFAULT_VERSION_STRING:
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH
         repo_owner, repo_name, version = cls.parse_version_string(version_string)
         if version.startswith("v"):
-            expected_path = str(
-                Path(cls.CUSTOM_FRONTENDS_ROOT)
-                / f"{repo_owner}_{repo_name}"
-                / version.lstrip("v")
-            )
+            expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
             if os.path.exists(expected_path):
-                logging.info(
-                    f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}"
-                )
+                logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
                 return expected_path
-        logging.info(
-            f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub..."
-        )
+        logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")
         provider = provider or FrontEndProvider(repo_owner, repo_name)
         release = provider.get_release(version)
@@ -318,5 +211,4 @@ comfyui-workflow-templates is not installed.
         except Exception as e:
             logging.error("Failed to initialize frontend: %s", e)
             logging.info("Falling back to the default frontend.")
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH

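The removed `check_frontend_version` compares dotted versions as integer tuples against the first line of requirements.txt. A small sketch of that comparison, assuming a pinned requirement line of the form `comfyui-frontend-package==1.2.3` (the pin value here is hypothetical):

```python
from importlib.metadata import PackageNotFoundError, version


def parse_version(v: str) -> tuple[int, ...]:
    return tuple(int(part) for part in v.split("."))


def frontend_is_outdated(requirement_line: str = "comfyui-frontend-package==1.2.3") -> bool:
    """Compare the installed comfyui-frontend-package against a pinned requirement."""
    required = parse_version(requirement_line.split("=")[-1])
    try:
        installed = parse_version(version("comfyui-frontend-package"))
    except PackageNotFoundError:
        return True  # Not installed at all counts as outdated.
    return installed < required


print(frontend_is_outdated())
```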
View File

@@ -82,17 +82,3 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
     logger.addHandler(stdout_handler)
     logger.addHandler(stream_handler)
-STARTUP_WARNINGS = []
-def log_startup_warning(msg):
-    logging.warning(msg)
-    STARTUP_WARNINGS.append(msg)
-def print_startup_warnings():
-    for s in STARTUP_WARNINGS:
-        logging.warning(s)
-    STARTUP_WARNINGS.clear()

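For readers unfamiliar with the pattern being dropped here, a self-contained sketch of deferred startup warnings: messages emitted before logging is fully configured are recorded and replayed later. The warning text below is only an illustration.

```python
import logging

STARTUP_WARNINGS: list[str] = []


def log_startup_warning(msg: str) -> None:
    logging.warning(msg)
    STARTUP_WARNINGS.append(msg)


def print_startup_warnings() -> None:
    # Replay anything recorded before handlers were attached, then reset.
    for msg in STARTUP_WARNINGS:
        logging.warning(msg)
    STARTUP_WARNINGS.clear()


log_startup_warning("frontend package is older than recommended")
logging.basicConfig(level=logging.WARNING)
print_startup_warnings()
```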
View File

@@ -197,112 +197,6 @@ class UserManager():
             return web.json_response(results)
-        @routes.get("/v2/userdata")
-        async def list_userdata_v2(request):
-            """
-            List files and directories in a user's data directory.
-            This endpoint provides a structured listing of contents within a specified
-            subdirectory of the user's data storage.
-            Query Parameters:
-            - path (optional): The relative path within the user's data directory
-              to list. Defaults to the root ('').
-            Returns:
-            - 400: If the requested path is invalid, outside the user's data directory, or is not a directory.
-            - 404: If the requested path does not exist.
-            - 403: If the user is invalid.
-            - 500: If there is an error reading the directory contents.
-            - 200: JSON response containing a list of file and directory objects.
-              Each object includes:
-              - name: The name of the file or directory.
-              - type: 'file' or 'directory'.
-              - path: The relative path from the user's data root.
-              - size (for files): The size in bytes.
-              - modified (for files): The last modified timestamp (Unix epoch).
-            """
-            requested_rel_path = request.rel_url.query.get('path', '')
-            # URL-decode the path parameter
-            try:
-                requested_rel_path = parse.unquote(requested_rel_path)
-            except Exception as e:
-                logging.warning(f"Failed to decode path parameter: {requested_rel_path}, Error: {e}")
-                return web.Response(status=400, text="Invalid characters in path parameter")
-            # Check user validity and get the absolute path for the requested directory
-            try:
-                base_user_path = self.get_request_user_filepath(request, None, create_dir=False)
-                if requested_rel_path:
-                    target_abs_path = self.get_request_user_filepath(request, requested_rel_path, create_dir=False)
-                else:
-                    target_abs_path = base_user_path
-            except KeyError as e:
-                # Invalid user detected by get_request_user_id inside get_request_user_filepath
-                logging.warning(f"Access denied for user: {e}")
-                return web.Response(status=403, text="Invalid user specified in request")
-            if not target_abs_path:
-                # Path traversal or other issue detected by get_request_user_filepath
-                return web.Response(status=400, text="Invalid path requested")
-            # Handle cases where the user directory or target path doesn't exist
-            if not os.path.exists(target_abs_path):
-                # Check if it's the base user directory that's missing (new user case)
-                if target_abs_path == base_user_path:
-                    # It's okay if the base user directory doesn't exist yet, return empty list
-                    return web.json_response([])
-                else:
-                    # A specific subdirectory was requested but doesn't exist
-                    return web.Response(status=404, text="Requested path not found")
-            if not os.path.isdir(target_abs_path):
-                return web.Response(status=400, text="Requested path is not a directory")
-            results = []
-            try:
-                for root, dirs, files in os.walk(target_abs_path, topdown=True):
-                    # Process directories
-                    for dir_name in dirs:
-                        dir_path = os.path.join(root, dir_name)
-                        rel_path = os.path.relpath(dir_path, base_user_path).replace(os.sep, '/')
-                        results.append({
-                            "name": dir_name,
-                            "path": rel_path,
-                            "type": "directory"
-                        })
-                    # Process files
-                    for file_name in files:
-                        file_path = os.path.join(root, file_name)
-                        rel_path = os.path.relpath(file_path, base_user_path).replace(os.sep, '/')
-                        entry_info = {
-                            "name": file_name,
-                            "path": rel_path,
-                            "type": "file"
-                        }
-                        try:
-                            stats = os.stat(file_path)  # Use os.stat for potentially better performance with os.walk
-                            entry_info["size"] = stats.st_size
-                            entry_info["modified"] = stats.st_mtime
-                        except OSError as stat_error:
-                            logging.warning(f"Could not stat file {file_path}: {stat_error}")
-                            pass  # Include file with available info
-                        results.append(entry_info)
-            except OSError as e:
-                logging.error(f"Error listing directory {target_abs_path}: {e}")
-                return web.Response(status=500, text="Error reading directory contents")
-            # Sort results alphabetically, directories first then files
-            results.sort(key=lambda x: (x['type'] != 'directory', x['name'].lower()))
-            return web.json_response(results)
 def get_user_data_path(request, check_exists = False, param = "file"):
     file = request.match_info.get(param, None)
     if not file:

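The docstring in the handler above fully specifies the endpoint's contract. A hedged client-side sketch against a locally running ComfyUI that serves /v2/userdata; the default 127.0.0.1:8188 address is assumed.

```python
import requests

BASE_URL = "http://127.0.0.1:8188"  # assumed default ComfyUI listen address


def list_userdata(path: str = "") -> list[dict]:
    """Call the /v2/userdata listing endpoint and return its file/directory entries."""
    resp = requests.get(f"{BASE_URL}/v2/userdata", params={"path": path}, timeout=10)
    resp.raise_for_status()  # 400/403/404/500 per the docstring above
    return resp.json()


for entry in list_userdata():
    marker = "/" if entry["type"] == "directory" else ""
    print(f'{entry["path"]}{marker}')
```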
View File

@@ -1,6 +1,7 @@
 import argparse
 import enum
 import os
+from typing import Optional
 import comfy.options
@@ -49,7 +50,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
 parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use.")
+parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
 cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
@@ -66,7 +67,6 @@ fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the diff
 fpunet_group.add_argument("--fp16-unet", action="store_true", help="Run the diffusion model in fp16")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")
-fpunet_group.add_argument("--fp8_e8m0fnu-unet", action="store_true", help="Store unet weights in fp8_e8m0fnu.")
 fpvae_group = parser.add_mutually_exclusive_group()
 fpvae_group.add_argument("--fp16-vae", action="store_true", help="Run the VAE in fp16, might cause black images.")
@@ -80,7 +80,6 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
-fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")
 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")
@@ -88,7 +87,6 @@ parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE"
 parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
 parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
-parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
 class LatentPreviewMethod(enum.Enum):
     NoPreviews = "none"
@@ -103,14 +101,12 @@ parser.add_argument("--preview-size", type=int, default=512, help="Sets the maxi
 cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
-cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
 attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
 attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
-attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")
 parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")
@@ -129,7 +125,6 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e
 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
-parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
@@ -139,11 +134,8 @@ parser.add_argument("--deterministic", action="store_true", help="Make pytorch u
 class PerformanceFeature(enum.Enum):
     Fp16Accumulation = "fp16_accumulation"
     Fp8MatrixMultiplication = "fp8_matrix_mult"
-    CublasOps = "cublas_ops"
-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
-parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult")
 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@@ -151,7 +143,6 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
@@ -175,14 +166,13 @@ parser.add_argument(
     """,
 )
-def is_valid_directory(path: str) -> str:
-    """Validate if the given path is a directory, and check permissions."""
-    if not os.path.exists(path):
-        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
     if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
-    if not os.access(path, os.R_OK):
-        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
     return path
 parser.add_argument(
@@ -196,13 +186,6 @@ parser.add_argument("--user-directory", type=is_valid_directory, default=None, h
 parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")
-parser.add_argument(
-    "--comfy-api-base",
-    type=str,
-    default="https://api.comfy.org",
-    help="Set the base URL for the ComfyUI API. (default: https://api.comfy.org)",
-)
 if comfy.options.args_parsing:
     args = parser.parse_args()
 else:

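For reference, a self-contained sketch of how the enum-typed `nargs="*"` parsing behind `--fast` behaves. This is a hypothetical standalone parser mirroring the PerformanceFeature values shown above, not ComfyUI's actual parser object.

```python
import argparse
import enum


class PerformanceFeature(enum.Enum):
    Fp16Accumulation = "fp16_accumulation"
    Fp8MatrixMultiplication = "fp8_matrix_mult"
    CublasOps = "cublas_ops"


parser = argparse.ArgumentParser()
# argparse calls PerformanceFeature(value) for each token, so tokens must match the enum values.
parser.add_argument("--fast", nargs="*", type=PerformanceFeature,
                    help="Enable optimizations; no arguments enables everything.")

args = parser.parse_args(["--fast", "fp16_accumulation", "fp8_matrix_mult"])
print(args.fast)   # list of PerformanceFeature members

args = parser.parse_args(["--fast"])
print(args.fast)   # [] -> per the help text above, an empty list means "enable everything"
```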
View File

@@ -97,12 +97,8 @@ class CLIPTextModel_(torch.nn.Module):
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
-    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
-        if embeds is not None:
-            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
-        else:
-            x = self.embeddings(input_tokens, dtype=dtype)
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
         mask = None
         if attention_mask is not None:
             mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -120,10 +116,7 @@ class CLIPTextModel_(torch.nn.Module):
         if i is not None and final_layer_norm_intermediate:
             i = self.final_layer_norm(i)
-        if num_tokens is not None:
-            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
-        else:
-            pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
         return x, i, pooled_output
 class CLIPTextModel(torch.nn.Module):
@@ -211,15 +204,6 @@ class CLIPVision(torch.nn.Module):
         pooled_output = self.post_layernorm(x[:, 0, :])
         return x, i, pooled_output
-class LlavaProjector(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, dtype, device, operations):
-        super().__init__()
-        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
-        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
-    def forward(self, x):
-        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
 class CLIPVisionModelProjection(torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
@@ -229,16 +213,7 @@ class CLIPVisionModelProjection(torch.nn.Module):
         else:
             self.visual_projection = lambda a: a
-        if "llava3" == config_dict.get("projector_type", None):
-            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
-        else:
-            self.multi_modal_projector = None
     def forward(self, *args, **kwargs):
         x = self.vision_model(*args, **kwargs)
         out = self.visual_projection(x[2])
-        projected = None
-        if self.multi_modal_projector is not None:
-            projected = self.multi_modal_projector(x[1])
-        return (x[0], x[1], out, projected)
+        return (x[0], x[1], out)

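The pooled-output indexing above picks the hidden state at the EOS-token position of each sequence. A shape-only PyTorch sketch of that indexing with random tensors; the eos_token_id value and tensor sizes here are arbitrary placeholders.

```python
import torch

batch, seq_len, dim = 2, 7, 16
eos_token_id = 49407  # placeholder id; CLIP variants differ

x = torch.randn(batch, seq_len, dim)                   # final hidden states [B, T, D]
input_tokens = torch.randint(0, 1000, (batch, seq_len))
input_tokens[0, 3] = eos_token_id                      # plant one EOS token per row
input_tokens[1, 5] = eos_token_id

# argmax over the boolean mask locates the EOS position in each row;
# advanced indexing then gathers one [D] vector per batch element.
eos_pos = (input_tokens == eos_token_id).int().argmax(dim=-1)
pooled = x[torch.arange(batch), eos_pos]
print(pooled.shape)  # torch.Size([2, 16])
```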
View File

@@ -9,7 +9,6 @@ import comfy.model_patcher
import comfy.model_management import comfy.model_management
import comfy.utils import comfy.utils
import comfy.clip_model import comfy.clip_model
import comfy.image_encoders.dino2
class Output: class Output:
def __getitem__(self, key): def __getitem__(self, key):
@@ -18,7 +17,6 @@ class Output:
setattr(self, key, item) setattr(self, key, item)
def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True): def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
image = image[:, :, :, :3] if image.shape[3] > 3 else image
mean = torch.tensor(mean, device=image.device, dtype=image.dtype) mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
std = torch.tensor(std, device=image.device, dtype=image.dtype) std = torch.tensor(std, device=image.device, dtype=image.dtype)
image = image.movedim(-1, 1) image = image.movedim(-1, 1)
@@ -36,12 +34,6 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
image = torch.clip((255. * image), 0, 255).round() / 255.0 image = torch.clip((255. * image), 0, 255).round() / 255.0
return (image - mean.view([3,1,1])) / std.view([3,1,1]) return (image - mean.view([3,1,1])) / std.view([3,1,1])
IMAGE_ENCODERS = {
"clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
"siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
"dinov2": comfy.image_encoders.dino2.Dinov2Model,
}
class ClipVisionModel(): class ClipVisionModel():
def __init__(self, json_config): def __init__(self, json_config):
with open(json_config) as f: with open(json_config) as f:
@@ -50,11 +42,10 @@ class ClipVisionModel():
self.image_size = config.get("image_size", 224) self.image_size = config.get("image_size", 224)
self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073]) self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711]) self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
self.load_device = comfy.model_management.text_encoder_device() self.load_device = comfy.model_management.text_encoder_device()
offload_device = comfy.model_management.text_encoder_offload_device() offload_device = comfy.model_management.text_encoder_offload_device()
self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast) self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
self.model.eval() self.model.eval()
self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
@@ -74,7 +65,6 @@ class ClipVisionModel():
outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device()) outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device()) outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device()) outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
outputs["mm_projected"] = out[3]
return outputs return outputs
def convert_to_transformers(sd, prefix): def convert_to_transformers(sd, prefix):
@@ -111,21 +101,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd: elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json") json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd: elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152: if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
if embed_shape == 729: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json") elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
elif embed_shape == 1024: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
elif embed_shape == 577:
if "multi_modal_projector.linear_1.bias" in sd:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
else:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
else: else:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
elif "embeddings.patch_embeddings.projection.weight" in sd:
json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
else: else:
return None return None
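The removed detection logic above picks an encoder class from `IMAGE_ENCODERS` and a JSON config from tensor shapes found in the state dict (layer-norm width, position-embedding count). A minimal sketch of that dispatch pattern, independent of ComfyUI internals; the config file names and shapes below are illustrative only, grounded in the checks shown in the hunk:

import torch

def pick_vision_config(sd: dict) -> str:
    """Guess which vision-encoder config a state dict belongs to from tensor shapes."""
    hidden = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
    num_pos = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
    if hidden == 1152:
        # SigLIP widths: 729 positions -> 384px variant, 1024 -> 512px variant (per the checks above)
        return "siglip_384.json" if num_pos == 729 else "siglip_512.json"
    if num_pos == 577:
        # 336px ViT-L: (336 / 14) ** 2 patches + 1 class token = 577
        return "vitl_336.json"
    return "vitl.json"

if __name__ == "__main__":
    fake_sd = {
        "vision_model.encoder.layers.0.layer_norm1.weight": torch.zeros(1152),
        "vision_model.embeddings.position_embedding.weight": torch.zeros(729, 1152),
    }
    print(pick_vision_config(fake_sd))  # -> siglip_384.json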

View File

@@ -1,19 +0,0 @@
{
"attention_dropout": 0.0,
"dropout": 0.0,
"hidden_act": "quick_gelu",
"hidden_size": 1024,
"image_size": 336,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-5,
"model_type": "clip_vision_model",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"projection_dim": 768,
"projector_type": "llava3",
"torch_dtype": "float32"
}

View File

@@ -1,13 +0,0 @@
{
"num_channels": 3,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"image_size": 512,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 16,
"image_mean": [0.5, 0.5, 0.5],
"image_std": [0.5, 0.5, 0.5]
}

View File

@@ -1,6 +1,6 @@
import torch import torch
from typing import Callable, Protocol, TypedDict, Optional, List from typing import Callable, Protocol, TypedDict, Optional, List
from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin
class UnetApplyFunction(Protocol): class UnetApplyFunction(Protocol):
@@ -42,5 +42,4 @@ __all__ = [
InputTypeDict.__name__, InputTypeDict.__name__,
ComfyNodeABC.__name__, ComfyNodeABC.__name__,
CheckLazyMixin.__name__, CheckLazyMixin.__name__,
FileLocator.__name__,
] ]

View File

@@ -1,8 +1,7 @@
"""Comfy-specific type hinting""" """Comfy-specific type hinting"""
from __future__ import annotations from __future__ import annotations
from typing import Literal, TypedDict, Optional from typing import Literal, TypedDict
from typing_extensions import NotRequired
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from enum import Enum from enum import Enum
@@ -27,7 +26,6 @@ class IO(StrEnum):
BOOLEAN = "BOOLEAN" BOOLEAN = "BOOLEAN"
INT = "INT" INT = "INT"
FLOAT = "FLOAT" FLOAT = "FLOAT"
COMBO = "COMBO"
CONDITIONING = "CONDITIONING" CONDITIONING = "CONDITIONING"
SAMPLER = "SAMPLER" SAMPLER = "SAMPLER"
SIGMAS = "SIGMAS" SIGMAS = "SIGMAS"
@@ -48,7 +46,6 @@ class IO(StrEnum):
FACE_ANALYSIS = "FACE_ANALYSIS" FACE_ANALYSIS = "FACE_ANALYSIS"
BBOX = "BBOX" BBOX = "BBOX"
SEGS = "SEGS" SEGS = "SEGS"
VIDEO = "VIDEO"
ANY = "*" ANY = "*"
"""Always matches any type, but at a price. """Always matches any type, but at a price.
@@ -69,7 +66,6 @@ class IO(StrEnum):
b = frozenset(value.split(",")) b = frozenset(value.split(","))
return not (b.issubset(a) or a.issubset(b)) return not (b.issubset(a) or a.issubset(b))
class RemoteInputOptions(TypedDict): class RemoteInputOptions(TypedDict):
route: str route: str
"""The route to the remote source.""" """The route to the remote source."""
@@ -84,14 +80,6 @@ class RemoteInputOptions(TypedDict):
refresh: int refresh: int
"""The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed.""" """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
class MultiSelectOptions(TypedDict):
placeholder: NotRequired[str]
"""The placeholder text to display in the multi-select widget when no items are selected."""
chip: NotRequired[bool]
"""Specifies whether to use chips instead of comma separated values for the multi-select widget."""
class InputTypeOptions(TypedDict): class InputTypeOptions(TypedDict):
"""Provides type hinting for the return type of the INPUT_TYPES node function. """Provides type hinting for the return type of the INPUT_TYPES node function.
@@ -100,94 +88,66 @@ class InputTypeOptions(TypedDict):
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
""" """
default: NotRequired[bool | str | float | int | list | tuple] default: bool | str | float | int | list | tuple
"""The default value of the widget""" """The default value of the widget"""
defaultInput: NotRequired[bool] defaultInput: bool
"""@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist. """Defaults to an input slot rather than a widget"""
- defaultInput on required inputs should be dropped. forceInput: bool
- defaultInput on optional inputs should be replaced with forceInput. """`defaultInput` and also don't allow converting to a widget"""
Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364 lazy: bool
"""
forceInput: NotRequired[bool]
"""Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
lazy: NotRequired[bool]
"""Declares that this input uses lazy evaluation""" """Declares that this input uses lazy evaluation"""
rawLink: NotRequired[bool] rawLink: bool
"""When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion.""" """When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
tooltip: NotRequired[str] tooltip: str
"""Tooltip for the input (or widget), shown on pointer hover""" """Tooltip for the input (or widget), shown on pointer hover"""
socketless: NotRequired[bool]
"""All inputs (including widgets) have an input socket to connect links. When ``true``, if there is a widget for this input, no socket will be created.
Available from frontend v1.17.5
Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3548
"""
widgetType: NotRequired[str]
"""Specifies a type to be used for widget initialization if different from the input type.
Available from frontend v1.18.0
https://github.com/Comfy-Org/ComfyUI_frontend/pull/3550"""
# class InputTypeNumber(InputTypeOptions): # class InputTypeNumber(InputTypeOptions):
# default: float | int # default: float | int
min: NotRequired[float] min: float
"""The minimum value of a number (``FLOAT`` | ``INT``)""" """The minimum value of a number (``FLOAT`` | ``INT``)"""
max: NotRequired[float] max: float
"""The maximum value of a number (``FLOAT`` | ``INT``)""" """The maximum value of a number (``FLOAT`` | ``INT``)"""
step: NotRequired[float] step: float
"""The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)""" """The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
round: NotRequired[float] round: float
"""Floats are rounded by this value (``FLOAT``)""" """Floats are rounded by this value (``FLOAT``)"""
# class InputTypeBoolean(InputTypeOptions): # class InputTypeBoolean(InputTypeOptions):
# default: bool # default: bool
label_on: NotRequired[str] label_on: str
"""The label to use in the UI when the bool is True (``BOOLEAN``)""" """The label to use in the UI when the bool is True (``BOOLEAN``)"""
label_off: NotRequired[str] label_on: str
"""The label to use in the UI when the bool is False (``BOOLEAN``)""" """The label to use in the UI when the bool is False (``BOOLEAN``)"""
# class InputTypeString(InputTypeOptions): # class InputTypeString(InputTypeOptions):
# default: str # default: str
multiline: NotRequired[bool] multiline: bool
"""Use a multiline text box (``STRING``)""" """Use a multiline text box (``STRING``)"""
placeholder: NotRequired[str] placeholder: str
"""Placeholder text to display in the UI when empty (``STRING``)""" """Placeholder text to display in the UI when empty (``STRING``)"""
# Deprecated: # Deprecated:
# defaultVal: str # defaultVal: str
dynamicPrompts: NotRequired[bool] dynamicPrompts: bool
"""Causes the front-end to evaluate dynamic prompts (``STRING``)""" """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
# class InputTypeCombo(InputTypeOptions): # class InputTypeCombo(InputTypeOptions):
image_upload: NotRequired[bool] image_upload: bool
"""Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`.""" """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
image_folder: NotRequired[Literal["input", "output", "temp"]] image_folder: Literal["input", "output", "temp"]
"""Specifies which folder to get preview images from if the input has the ``image_upload`` flag. """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
""" """
remote: NotRequired[RemoteInputOptions] remote: RemoteInputOptions
"""Specifies the configuration for a remote input. """Specifies the configuration for a remote input."""
Available after ComfyUI frontend v1.9.7
https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
control_after_generate: NotRequired[bool]
"""Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
options: NotRequired[list[str | int | float]]
"""COMBO type only. Specifies the selectable options for the combo widget.
Prefer:
["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
Over:
[["Option 1", "Option 2", "Option 3"]]
"""
multi_select: NotRequired[MultiSelectOptions]
"""COMBO type only. Specifies the configuration for a multi-select widget.
Available after ComfyUI frontend v1.13.4
https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""
class HiddenInputTypeDict(TypedDict): class HiddenInputTypeDict(TypedDict):
"""Provides type hinting for the hidden entry of node INPUT_TYPES.""" """Provides type hinting for the hidden entry of node INPUT_TYPES."""
node_id: NotRequired[Literal["UNIQUE_ID"]] node_id: Literal["UNIQUE_ID"]
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
unique_id: NotRequired[Literal["UNIQUE_ID"]] unique_id: Literal["UNIQUE_ID"]
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
prompt: NotRequired[Literal["PROMPT"]] prompt: Literal["PROMPT"]
"""PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description.""" """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]] extra_pnginfo: Literal["EXTRA_PNGINFO"]
"""EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node).""" """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
dynprompt: NotRequired[Literal["DYNPROMPT"]] dynprompt: Literal["DYNPROMPT"]
"""DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion.""" """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
@@ -197,11 +157,11 @@ class InputTypeDict(TypedDict):
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
""" """
required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]] required: dict[str, tuple[IO, InputTypeOptions]]
"""Describes all inputs that must be connected for the node to execute.""" """Describes all inputs that must be connected for the node to execute."""
optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]] optional: dict[str, tuple[IO, InputTypeOptions]]
"""Describes inputs which do not need to be connected.""" """Describes inputs which do not need to be connected."""
hidden: NotRequired[HiddenInputTypeDict] hidden: HiddenInputTypeDict
"""Offers advanced functionality and server-client communication. """Offers advanced functionality and server-client communication.
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
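The option keys documented above (`default`, `min`/`max`, `tooltip`, `forceInput`, COMBO `options`) and the `required`/`optional`/`hidden` sections of `InputTypeDict` are all consumed through a node's `INPUT_TYPES` classmethod. A minimal sketch of such a dict; the node and its values are hypothetical, only the key names follow the docstrings above:

class ExampleNodeInputs:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "steps": ("INT", {"default": 20, "min": 1, "max": 100, "step": 1,
                                  "tooltip": "How many steps to run"}),
                # The preferred COMBO form, per the options docstring above.
                "mode": ("COMBO", {"options": ["fast", "accurate"], "default": "fast"}),
            },
            "optional": {
                "prompt": ("STRING", {"multiline": True, "placeholder": "optional text"}),
            },
            "hidden": {
                "unique_id": "UNIQUE_ID",
            },
        }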
@@ -234,8 +194,6 @@ class ComfyNodeABC(ABC):
"""Flags a node as experimental, informing users that it may change or not work as expected.""" """Flags a node as experimental, informing users that it may change or not work as expected."""
DEPRECATED: bool DEPRECATED: bool
"""Flags a node as deprecated, indicating to users that they should find alternatives to this node.""" """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
API_NODE: Optional[bool]
"""Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview."""
@classmethod @classmethod
@abstractmethod @abstractmethod
@@ -274,7 +232,7 @@ class ComfyNodeABC(ABC):
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
""" """
OUTPUT_IS_LIST: tuple[bool, ...] OUTPUT_IS_LIST: tuple[bool]
"""A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items. """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.
Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list. Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
@@ -293,7 +251,7 @@ class ComfyNodeABC(ABC):
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
""" """
RETURN_TYPES: tuple[IO, ...] RETURN_TYPES: tuple[IO]
"""A tuple representing the outputs of this node. """A tuple representing the outputs of this node.
Usage:: Usage::
@@ -302,12 +260,12 @@ class ComfyNodeABC(ABC):
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
""" """
RETURN_NAMES: tuple[str, ...] RETURN_NAMES: tuple[str]
"""The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")`` """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
""" """
OUTPUT_TOOLTIPS: tuple[str, ...] OUTPUT_TOOLTIPS: tuple[str]
"""A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`.""" """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
FUNCTION: str FUNCTION: str
"""The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"` """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`
@@ -335,14 +293,3 @@ class CheckLazyMixin:
need = [name for name in kwargs if kwargs[name] is None] need = [name for name in kwargs if kwargs[name] is None]
return need return need
class FileLocator(TypedDict):
"""Provides type hinting for the file location"""
filename: str
"""The filename of the file."""
subfolder: str
"""The subfolder of the file."""
type: Literal["input", "output", "temp"]
"""The root folder of the file."""

View File

@@ -24,10 +24,6 @@ class CONDRegular:
conds.append(x.cond) conds.append(x.cond)
return torch.cat(conds) return torch.cat(conds)
def size(self):
return list(self.cond.size())
class CONDNoiseShape(CONDRegular): class CONDNoiseShape(CONDRegular):
def process_cond(self, batch_size, device, area, **kwargs): def process_cond(self, batch_size, device, area, **kwargs):
data = self.cond data = self.cond
@@ -68,7 +64,6 @@ class CONDCrossAttn(CONDRegular):
out.append(c) out.append(c)
return torch.cat(out) return torch.cat(out)
class CONDConstant(CONDRegular): class CONDConstant(CONDRegular):
def __init__(self, cond): def __init__(self, cond):
self.cond = cond self.cond = cond
@@ -83,48 +78,3 @@ class CONDConstant(CONDRegular):
def concat(self, others): def concat(self, others):
return self.cond return self.cond
def size(self):
return [1]
class CONDList(CONDRegular):
def __init__(self, cond):
self.cond = cond
def process_cond(self, batch_size, device, **kwargs):
out = []
for c in self.cond:
out.append(comfy.utils.repeat_to_batch_size(c, batch_size).to(device))
return self._copy_with(out)
def can_concat(self, other):
if len(self.cond) != len(other.cond):
return False
for i in range(len(self.cond)):
if self.cond[i].shape != other.cond[i].shape:
return False
return True
def concat(self, others):
out = []
for i in range(len(self.cond)):
o = [self.cond[i]]
for x in others:
o.append(x.cond[i])
out.append(torch.cat(o))
return out
def size(self): # hackish implementation to make the mem estimation work
o = 0
c = 1
for c in self.cond:
size = c.size()
o += math.prod(size)
if len(size) > 1:
c = size[1]
return [1, c, o // c]
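The removed `CONDList` wraps a list of conditioning tensors and only allows batch concatenation when every entry matches entry-by-entry; its `size()` deliberately flattens the list into a fake [1, C, N] shape so the existing memory estimation keeps working. A stripped-down sketch of the concat logic, outside ComfyUI's class hierarchy (names are illustrative):

import torch

def can_concat_cond_lists(a: list, b: list) -> bool:
    """Two conditioning lists can be batched only if their entries match in shape."""
    return len(a) == len(b) and all(x.shape == y.shape for x, y in zip(a, b))

def concat_cond_lists(first: list, others: list) -> list:
    """Concatenate corresponding entries of several conditioning lists along the batch dim."""
    return [torch.cat([first[i]] + [o[i] for o in others]) for i in range(len(first))]

if __name__ == "__main__":
    a = [torch.zeros(1, 4, 8), torch.zeros(1, 16)]
    b = [torch.zeros(1, 4, 8), torch.zeros(1, 16)]
    print(can_concat_cond_lists(a, b))                    # True
    print([t.shape for t in concat_cond_lists(a, [b])])   # batch dim becomes 2 in each entry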

View File

@@ -15,14 +15,13 @@
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. along with this program. If not, see <https://www.gnu.org/licenses/>.
""" """
from __future__ import annotations
import torch import torch
from enum import Enum from enum import Enum
import math import math
import os import os
import logging import logging
import copy
import comfy.utils import comfy.utils
import comfy.model_management import comfy.model_management
import comfy.model_detection import comfy.model_detection
@@ -37,7 +36,7 @@ import comfy.cldm.mmdit
import comfy.ldm.hydit.controlnet import comfy.ldm.hydit.controlnet
import comfy.ldm.flux.controlnet import comfy.ldm.flux.controlnet
import comfy.cldm.dit_embedder import comfy.cldm.dit_embedder
from typing import TYPE_CHECKING, Union from typing import TYPE_CHECKING
if TYPE_CHECKING: if TYPE_CHECKING:
from comfy.hooks import HookGroup from comfy.hooks import HookGroup
@@ -64,18 +63,6 @@ class StrengthType(Enum):
CONSTANT = 1 CONSTANT = 1
LINEAR_UP = 2 LINEAR_UP = 2
class ControlIsolation:
'''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.'''
def __init__(self, control: ControlBase):
self.control = control
self.orig_previous_controlnet = control.previous_controlnet
def __enter__(self):
self.control.previous_controlnet = None
def __exit__(self, *args):
self.control.previous_controlnet = self.orig_previous_controlnet
class ControlBase: class ControlBase:
def __init__(self): def __init__(self):
self.cond_hint_original = None self.cond_hint_original = None
@@ -89,7 +76,7 @@ class ControlBase:
self.compression_ratio = 8 self.compression_ratio = 8
self.upscale_algorithm = 'nearest-exact' self.upscale_algorithm = 'nearest-exact'
self.extra_args = {} self.extra_args = {}
self.previous_controlnet: Union[ControlBase, None] = None self.previous_controlnet = None
self.extra_conds = [] self.extra_conds = []
self.strength_type = StrengthType.CONSTANT self.strength_type = StrengthType.CONSTANT
self.concat_mask = False self.concat_mask = False
@@ -97,7 +84,6 @@ class ControlBase:
self.extra_concat = None self.extra_concat = None
self.extra_hooks: HookGroup = None self.extra_hooks: HookGroup = None
self.preprocess_image = lambda a: a self.preprocess_image = lambda a: a
self.multigpu_clones: dict[torch.device, ControlBase] = {}
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]): def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
self.cond_hint_original = cond_hint self.cond_hint_original = cond_hint
@@ -124,38 +110,17 @@ class ControlBase:
def cleanup(self): def cleanup(self):
if self.previous_controlnet is not None: if self.previous_controlnet is not None:
self.previous_controlnet.cleanup() self.previous_controlnet.cleanup()
for device_cnet in self.multigpu_clones.values():
with ControlIsolation(device_cnet):
device_cnet.cleanup()
self.cond_hint = None self.cond_hint = None
self.extra_concat = None self.extra_concat = None
self.timestep_range = None self.timestep_range = None
def get_models(self): def get_models(self):
out = [] out = []
for device_cnet in self.multigpu_clones.values():
out += device_cnet.get_models_only_self()
if self.previous_controlnet is not None: if self.previous_controlnet is not None:
out += self.previous_controlnet.get_models() out += self.previous_controlnet.get_models()
return out return out
def get_models_only_self(self):
'Calls get_models, but temporarily sets previous_controlnet to None.'
with ControlIsolation(self):
return self.get_models()
def get_instance_for_device(self, device):
'Returns instance of this Control object intended for selected device.'
return self.multigpu_clones.get(device, self)
def deepclone_multigpu(self, load_device, autoregister=False):
'''
Create deep clone of Control object where model(s) is set to other devices.
When autoregister is set to True, the deep clone is also added to multigpu_clones dict.
'''
raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu function.")
def get_extra_hooks(self): def get_extra_hooks(self):
out = [] out = []
if self.extra_hooks is not None: if self.extra_hooks is not None:
@@ -164,7 +129,7 @@ class ControlBase:
out += self.previous_controlnet.get_extra_hooks() out += self.previous_controlnet.get_extra_hooks()
return out return out
def copy_to(self, c: ControlBase): def copy_to(self, c):
c.cond_hint_original = self.cond_hint_original c.cond_hint_original = self.cond_hint_original
c.strength = self.strength c.strength = self.strength
c.timestep_percent_range = self.timestep_percent_range c.timestep_percent_range = self.timestep_percent_range
@@ -315,14 +280,6 @@ class ControlNet(ControlBase):
self.copy_to(c) self.copy_to(c)
return c return c
def deepclone_multigpu(self, load_device, autoregister=False):
c = self.copy()
c.control_model = copy.deepcopy(c.control_model)
c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
if autoregister:
self.multigpu_clones[load_device] = c
return c
def get_models(self): def get_models(self):
out = super().get_models() out = super().get_models()
out.append(self.control_model_wrapped) out.append(self.control_model_wrapped)
@@ -779,7 +736,6 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
return control return control
def load_controlnet(ckpt_path, model=None, model_options={}): def load_controlnet(ckpt_path, model=None, model_options={}):
model_options = model_options.copy()
if "global_average_pooling" not in model_options: if "global_average_pooling" not in model_options:
filename = os.path.splitext(ckpt_path)[0] filename = os.path.splitext(ckpt_path)[0]
if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
@@ -848,14 +804,6 @@ class T2IAdapter(ControlBase):
self.copy_to(c) self.copy_to(c)
return c return c
def deepclone_multigpu(self, load_device, autoregister=False):
c = self.copy()
c.t2i_model = copy.deepcopy(c.t2i_model)
c.device = load_device
if autoregister:
self.multigpu_clones[load_device] = c
return c
def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
compression_ratio = 8 compression_ratio = 8
upscale_algorithm = 'nearest-exact' upscale_algorithm = 'nearest-exact'
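The removed `ControlIsolation` is a small context manager that temporarily severs the `previous_controlnet` link so a call such as `cleanup()` or `get_models()` runs on a single node of the chain without cascading to its predecessors. The same pattern reduced to a generic linked-node example; the class and attribute names here are illustrative, not ComfyUI's:

class NodeIsolation:
    """Temporarily detach a node from its predecessor so an operation does not cascade."""
    def __init__(self, node):
        self.node = node
        self.saved_prev = node.prev
    def __enter__(self):
        self.node.prev = None
        return self.node
    def __exit__(self, *exc):
        self.node.prev = self.saved_prev

class Node:
    def __init__(self, prev=None):
        self.prev = prev
    def count_chain(self):
        return 1 + (self.prev.count_chain() if self.prev else 0)

if __name__ == "__main__":
    chain = Node(Node(Node()))
    print(chain.count_chain())        # 3: walks the whole chain
    with NodeIsolation(chain):
        print(chain.count_chain())    # 1: predecessor link hidden inside the context
    print(chain.count_chain())        # 3 again: link restored on exit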

View File

@@ -1,141 +0,0 @@
import torch
from comfy.text_encoders.bert import BertAttention
import comfy.model_management
from comfy.ldm.modules.attention import optimized_attention_for_device
class Dino2AttentionOutput(torch.nn.Module):
def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
super().__init__()
self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
def forward(self, x):
return self.dense(x)
class Dino2AttentionBlock(torch.nn.Module):
def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
def forward(self, x, mask, optimized_attention):
return self.output(self.attention(x, mask, optimized_attention))
class LayerScale(torch.nn.Module):
def __init__(self, dim, dtype, device, operations):
super().__init__()
self.lambda1 = torch.nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
def forward(self, x):
return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
class SwiGLUFFN(torch.nn.Module):
def __init__(self, dim, dtype, device, operations):
super().__init__()
in_features = out_features = dim
hidden_features = int(dim * 4)
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
self.weights_in = operations.Linear(in_features, 2 * hidden_features, bias=True, device=device, dtype=dtype)
self.weights_out = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
def forward(self, x):
x = self.weights_in(x)
x1, x2 = x.chunk(2, dim=-1)
x = torch.nn.functional.silu(x1) * x2
return self.weights_out(x)
class Dino2Block(torch.nn.Module):
def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
super().__init__()
self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
self.layer_scale1 = LayerScale(dim, dtype, device, operations)
self.layer_scale2 = LayerScale(dim, dtype, device, operations)
self.mlp = SwiGLUFFN(dim, dtype, device, operations)
self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, x, optimized_attention):
x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
x = x + self.layer_scale2(self.mlp(self.norm2(x)))
return x
class Dino2Encoder(torch.nn.Module):
def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
super().__init__()
self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
def forward(self, x, intermediate_output=None):
optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
if intermediate_output is not None:
if intermediate_output < 0:
intermediate_output = len(self.layer) + intermediate_output
intermediate = None
for i, l in enumerate(self.layer):
x = l(x, optimized_attention)
if i == intermediate_output:
intermediate = x.clone()
return x, intermediate
class Dino2PatchEmbeddings(torch.nn.Module):
def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
super().__init__()
self.projection = operations.Conv2d(
in_channels=num_channels,
out_channels=dim,
kernel_size=patch_size,
stride=patch_size,
bias=True,
dtype=dtype,
device=device
)
def forward(self, pixel_values):
return self.projection(pixel_values).flatten(2).transpose(1, 2)
class Dino2Embeddings(torch.nn.Module):
def __init__(self, dim, dtype, device, operations):
super().__init__()
patch_size = 14
image_size = 518
self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
def forward(self, pixel_values):
x = self.patch_embeddings(pixel_values)
# TODO: mask_token?
x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
return x
class Dinov2Model(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
num_layers = config_dict["num_hidden_layers"]
dim = config_dict["hidden_size"]
heads = config_dict["num_attention_heads"]
layer_norm_eps = config_dict["layer_norm_eps"]
self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
x = self.embeddings(pixel_values)
x, i = self.encoder(x, intermediate_output=intermediate_output)
x = self.layernorm(x)
pooled_output = x[:, 0, :]
return x, i, pooled_output, None
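The removed `SwiGLUFFN` implements the SwiGLU feed-forward used by the larger DINOv2 variants: one projection produces two halves, one half is gated through SiLU, and the hidden width is 2/3 of 4*dim rounded up to a multiple of 8, exactly as in the code above. A standalone sketch with plain `torch.nn` instead of the ComfyUI `operations` wrapper:

import torch

class SwiGLUFFN(torch.nn.Module):
    """SwiGLU feed-forward: silu(x W1) * (x W2), projected back to the model width."""
    def __init__(self, dim: int):
        super().__init__()
        hidden = int(dim * 4)
        hidden = (int(hidden * 2 / 3) + 7) // 8 * 8   # same rounding as the removed code
        self.weights_in = torch.nn.Linear(dim, 2 * hidden)
        self.weights_out = torch.nn.Linear(hidden, dim)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x1, x2 = self.weights_in(x).chunk(2, dim=-1)
        return self.weights_out(torch.nn.functional.silu(x1) * x2)

if __name__ == "__main__":
    ffn = SwiGLUFFN(dim=64)
    print(ffn(torch.randn(2, 10, 64)).shape)   # torch.Size([2, 10, 64])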

View File

@@ -1,21 +0,0 @@
{
"attention_probs_dropout_prob": 0.0,
"drop_path_rate": 0.0,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 1536,
"image_size": 518,
"initializer_range": 0.02,
"layer_norm_eps": 1e-06,
"layerscale_value": 1.0,
"mlp_ratio": 4,
"model_type": "dinov2",
"num_attention_heads": 24,
"num_channels": 3,
"num_hidden_layers": 40,
"patch_size": 14,
"qkv_bias": true,
"use_swiglu_ffn": true,
"image_mean": [0.485, 0.456, 0.406],
"image_std": [0.229, 0.224, 0.225]
}

View File

@@ -688,10 +688,10 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
if len(sigmas) <= 1: if len(sigmas) <= 1:
return x return x
extra_args = {} if extra_args is None else extra_args
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
seed = extra_args.get("seed", None) seed = extra_args.get("seed", None)
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]]) s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda t: t.neg().exp() sigma_fn = lambda t: t.neg().exp()
t_fn = lambda sigma: sigma.log().neg() t_fn = lambda sigma: sigma.log().neg()
@@ -762,10 +762,10 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
if solver_type not in {'heun', 'midpoint'}: if solver_type not in {'heun', 'midpoint'}:
raise ValueError('solver_type must be \'heun\' or \'midpoint\'') raise ValueError('solver_type must be \'heun\' or \'midpoint\'')
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None) seed = extra_args.get("seed", None)
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]]) s_in = x.new_ones([x.shape[0]])
old_denoised = None old_denoised = None
@@ -808,10 +808,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
if len(sigmas) <= 1: if len(sigmas) <= 1:
return x return x
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None) seed = extra_args.get("seed", None)
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]]) s_in = x.new_ones([x.shape[0]])
denoised_1, denoised_2 = None, None denoised_1, denoised_2 = None, None
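The three hunks above differ only in where the `extra_args = {} if extra_args is None else extra_args` guard sits relative to the `extra_args.get("seed", None)` lookup. The ordering matters because calling `.get` on `None` raises. A small sketch of the two orderings, with hypothetical function names:

def seed_lookup_before_guard(extra_args=None):
    seed = extra_args.get("seed", None)            # AttributeError when extra_args is None
    extra_args = {} if extra_args is None else extra_args
    return seed

def guard_before_seed_lookup(extra_args=None):
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)            # safe: always a dict here
    return seed

if __name__ == "__main__":
    print(guard_before_seed_lookup())              # None
    try:
        seed_lookup_before_guard()
    except AttributeError as e:
        print("fails when no extra_args are passed:", e)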
@@ -858,7 +858,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
if len(sigmas) <= 1: if len(sigmas) <= 1:
return x return x
extra_args = {} if extra_args is None else extra_args
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler) return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@@ -867,7 +867,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'): def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
if len(sigmas) <= 1: if len(sigmas) <= 1:
return x return x
extra_args = {} if extra_args is None else extra_args
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type) return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@@ -876,7 +876,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2): def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
if len(sigmas) <= 1: if len(sigmas) <= 1:
return x return x
extra_args = {} if extra_args is None else extra_args
sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r) return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@@ -1277,7 +1277,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
phi1_fn = lambda t: torch.expm1(t) / t phi1_fn = lambda t: torch.expm1(t) / t
phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t
old_sigma_down = None
old_denoised = None old_denoised = None
uncond_denoised = None uncond_denoised = None
def post_cfg_function(args): def post_cfg_function(args):
@@ -1305,9 +1304,9 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
x = x + d * dt x = x + d * dt
else: else:
# Second order multistep method in https://arxiv.org/pdf/2308.02157 # Second order multistep method in https://arxiv.org/pdf/2308.02157
t, t_old, t_next, t_prev = t_fn(sigmas[i]), t_fn(old_sigma_down), t_fn(sigma_down), t_fn(sigmas[i - 1]) t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigma_down), t_fn(sigmas[i - 1])
h = t_next - t h = t_next - t
c2 = (t_prev - t_old) / h c2 = (t_prev - t) / h
phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h) phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0) b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
@@ -1327,7 +1326,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
old_denoised = uncond_denoised old_denoised = uncond_denoised
else: else:
old_denoised = denoised old_denoised = denoised
old_sigma_down = sigma_down
return x return x
@torch.no_grad() @torch.no_grad()
@@ -1347,202 +1345,24 @@ def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, cal
return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True) return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
@torch.no_grad() @torch.no_grad()
def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2., cfg_pp=False): def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
"""Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK""" """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
extra_args = {} if extra_args is None else extra_args extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]]) s_in = x.new_ones([x.shape[0]])
old_d = None old_d = None
uncond_denoised = None
def post_cfg_function(args):
nonlocal uncond_denoised
uncond_denoised = args["uncond_denoised"]
return args["denoised"]
if cfg_pp:
model_options = extra_args.get("model_options", {}).copy()
extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
for i in trange(len(sigmas) - 1, disable=disable): for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args) denoised = model(x, sigmas[i] * s_in, **extra_args)
if cfg_pp: d = to_d(x, sigmas[i], denoised)
d = to_d(x, sigmas[i], uncond_denoised)
else:
d = to_d(x, sigmas[i], denoised)
if callback is not None: if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
dt = sigmas[i + 1] - sigmas[i] dt = sigmas[i + 1] - sigmas[i]
if i == 0: if i == 0:
# Euler method # Euler method
if cfg_pp: x = x + d * dt
x = denoised + d * sigmas[i + 1]
else:
x = x + d * dt
else: else:
# Gradient estimation # Gradient estimation
if cfg_pp: d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
d_bar = (ge_gamma - 1) * (d - old_d) x = x + d_bar * dt
x = denoised + d * sigmas[i + 1] + d_bar * dt
else:
d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
x = x + d_bar * dt
old_d = d old_d = d
return x return x
@torch.no_grad()
def sample_gradient_estimation_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
return sample_gradient_estimation(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, ge_gamma=ge_gamma, cfg_pp=True)
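The gradient-estimation update shown above combines the current and previous denoising directions as d_bar = gamma * d + (1 - gamma) * old_d before taking the Euler-style step, while the CFG++ branch re-anchors on `denoised` and adds a (gamma - 1) * (d - old_d) correction. A scalar walk-through of the plain (non-CFG++) arithmetic, with made-up sigma values, just to show the step structure:

# Illustrative numbers only; a real run uses per-step model outputs.
ge_gamma = 2.0
old_d = None
x = 1.0
steps = [(1.0, 0.8, 0.30), (0.8, 0.5, 0.25), (0.5, 0.0, 0.20)]   # (sigma_i, sigma_next, d)
for sigma, sigma_next, d in steps:
    dt = sigma_next - sigma
    if old_d is None:
        x = x + d * dt                                  # first step: plain Euler
    else:
        d_bar = ge_gamma * d + (1 - ge_gamma) * old_d   # extrapolated direction
        x = x + d_bar * dt
    old_d = d
print(x)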
@torch.no_grad()
def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
"""
Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
"""
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
def default_noise_scaler(sigma):
return sigma * ((sigma ** 0.3).exp() + 10.0)
noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
num_integration_points = 200.0
point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
old_denoised = None
old_denoised_d = None
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
stage_used = min(max_stage, i + 1)
if sigmas[i + 1] == 0:
x = denoised
elif stage_used == 1:
r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
x = r * x + (1 - r) * denoised
else:
r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
x = r * x + (1 - r) * denoised
dt = sigmas[i + 1] - sigmas[i]
sigma_step_size = -dt / num_integration_points
sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
scaled_pos = noise_scaler(sigma_pos)
# Stage 2
s = torch.sum(1 / scaled_pos) * sigma_step_size
denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
if stage_used >= 3:
# Stage 3
s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
old_denoised_d = denoised_d
if s_noise != 0 and sigmas[i + 1] > 0:
x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
old_denoised = denoised
return x
@torch.no_grad()
def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
'''
SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
Arxiv: https://arxiv.org/abs/2305.14267
'''
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
inject_noise = eta > 0 and s_noise > 0
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
if sigmas[i + 1] == 0:
x = denoised
else:
t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
h = t_next - t
h_eta = h * (eta + 1)
s = t + r * h
fac = 1 / (2 * r)
sigma_s = s.neg().exp()
coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
if inject_noise:
noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
# Step 1
x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
if inject_noise:
x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
# Step 2
denoised_d = (1 - fac) * denoised + fac * denoised_2
x = (coeff_2 + 1) * x - coeff_2 * denoised_d
if inject_noise:
x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
return x
@torch.no_grad()
def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
'''
SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
Arxiv: https://arxiv.org/abs/2305.14267
'''
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
inject_noise = eta > 0 and s_noise > 0
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
if sigmas[i + 1] == 0:
x = denoised
else:
t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
h = t_next - t
h_eta = h * (eta + 1)
s_1 = t + r_1 * h
s_2 = t + r_2 * h
sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
if inject_noise:
noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
# Step 1
x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
if inject_noise:
x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
# Step 2
x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
if inject_noise:
x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
# Step 3
x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
if inject_noise:
x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
return x
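Both SEEDS solvers above work in log-sigma time, t = -log(sigma), and build their interpolation coefficients from `expm1` of the step size. As a quick sanity check of that change of variables (illustrative values, eta = 0 so h_eta reduces to h), the first-order coefficient `(-h).expm1()` equals `sigma_next / sigma - 1`:

import torch

sigma, sigma_next = torch.tensor(0.8), torch.tensor(0.5)
t, t_next = -sigma.log(), -sigma_next.log()
h = t_next - t
print((-h).expm1())              # -0.375
print(sigma_next / sigma - 1)    # -0.375, same value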

View File

@@ -456,17 +456,3 @@ class Wan21(LatentFormat):
latents_mean = self.latents_mean.to(latent.device, latent.dtype) latents_mean = self.latents_mean.to(latent.device, latent.dtype)
latents_std = self.latents_std.to(latent.device, latent.dtype) latents_std = self.latents_std.to(latent.device, latent.dtype)
return latent * latents_std / self.scale_factor + latents_mean return latent * latents_std / self.scale_factor + latents_mean
class Hunyuan3Dv2(LatentFormat):
latent_channels = 64
latent_dimensions = 1
scale_factor = 0.9990943042622529
class Hunyuan3Dv2mini(LatentFormat):
latent_channels = 64
latent_dimensions = 1
scale_factor = 1.0188137142395404
class ACEAudio(LatentFormat):
latent_channels = 8
latent_dimensions = 2
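The removed Hunyuan3D and ACE audio formats only declare channel counts and a scale factor; under the base `LatentFormat` convention (not shown in this diff) that factor is applied symmetrically on the way in and out. A sketch of that convention, with the base class reconstructed as an assumption:

class SimpleLatentFormat:
    """Assumed base behaviour: multiply by scale_factor going in, divide coming out."""
    scale_factor = 1.0
    def process_in(self, latent):
        return latent * self.scale_factor
    def process_out(self, latent):
        return latent / self.scale_factor

class Hunyuan3Dv2(SimpleLatentFormat):
    latent_channels = 64
    latent_dimensions = 1
    scale_factor = 0.9990943042622529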

View File

@@ -1,761 +0,0 @@
# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/attention.py
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple, Union, Optional
import torch
import torch.nn.functional as F
from torch import nn
import comfy.model_management
from comfy.ldm.modules.attention import optimized_attention
class Attention(nn.Module):
def __init__(
self,
query_dim: int,
cross_attention_dim: Optional[int] = None,
heads: int = 8,
kv_heads: Optional[int] = None,
dim_head: int = 64,
dropout: float = 0.0,
bias: bool = False,
qk_norm: Optional[str] = None,
added_kv_proj_dim: Optional[int] = None,
added_proj_bias: Optional[bool] = True,
out_bias: bool = True,
scale_qk: bool = True,
only_cross_attention: bool = False,
eps: float = 1e-5,
rescale_output_factor: float = 1.0,
residual_connection: bool = False,
processor=None,
out_dim: int = None,
out_context_dim: int = None,
context_pre_only=None,
pre_only=False,
elementwise_affine: bool = True,
is_causal: bool = False,
dtype=None, device=None, operations=None
):
super().__init__()
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
self.query_dim = query_dim
self.use_bias = bias
self.is_cross_attention = cross_attention_dim is not None
self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
self.rescale_output_factor = rescale_output_factor
self.residual_connection = residual_connection
self.dropout = dropout
self.fused_projections = False
self.out_dim = out_dim if out_dim is not None else query_dim
self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
self.context_pre_only = context_pre_only
self.pre_only = pre_only
self.is_causal = is_causal
self.scale_qk = scale_qk
self.scale = dim_head**-0.5 if self.scale_qk else 1.0
self.heads = out_dim // dim_head if out_dim is not None else heads
# for slice_size > 0 the attention score computation
# is split across the batch axis to save memory
# You can set slice_size with `set_attention_slice`
self.sliceable_head_dim = heads
self.added_kv_proj_dim = added_kv_proj_dim
self.only_cross_attention = only_cross_attention
if self.added_kv_proj_dim is None and self.only_cross_attention:
raise ValueError(
"`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
)
self.group_norm = None
self.spatial_norm = None
self.norm_q = None
self.norm_k = None
self.norm_cross = None
self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
if not self.only_cross_attention:
# only relevant for the `AddedKVProcessor` classes
self.to_k = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
self.to_v = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
else:
self.to_k = None
self.to_v = None
self.added_proj_bias = added_proj_bias
if self.added_kv_proj_dim is not None:
self.add_k_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
self.add_v_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
if self.context_pre_only is not None:
self.add_q_proj = operations.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias, dtype=dtype, device=device)
else:
self.add_q_proj = None
self.add_k_proj = None
self.add_v_proj = None
if not self.pre_only:
self.to_out = nn.ModuleList([])
self.to_out.append(operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device))
self.to_out.append(nn.Dropout(dropout))
else:
self.to_out = None
if self.context_pre_only is not None and not self.context_pre_only:
self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
else:
self.to_add_out = None
self.norm_added_q = None
self.norm_added_k = None
self.processor = processor
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**cross_attention_kwargs,
) -> torch.Tensor:
return self.processor(
self,
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
**cross_attention_kwargs,
)
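The `Attention` module above delegates all computation to an injected `processor` callable, which is why its `forward` is a one-line dispatch: swapping the processor changes the attention math without touching the module's parameters. A minimal sketch of the same delegation pattern, standalone and not ComfyUI's classes:

import torch

class TinyAttention(torch.nn.Module):
    """Holds projections; the actual attention math lives in an injected processor."""
    def __init__(self, dim, processor):
        super().__init__()
        self.to_q = torch.nn.Linear(dim, dim)
        self.to_k = torch.nn.Linear(dim, dim)
        self.to_v = torch.nn.Linear(dim, dim)
        self.processor = processor
    def forward(self, x, **kwargs):
        return self.processor(self, x, **kwargs)

class SoftmaxProcessor:
    def __call__(self, attn, x):
        q, k, v = attn.to_q(x), attn.to_k(x), attn.to_v(x)
        w = torch.softmax(q @ k.transpose(-1, -2) / q.shape[-1] ** 0.5, dim=-1)
        return w @ v

if __name__ == "__main__":
    layer = TinyAttention(16, SoftmaxProcessor())
    print(layer(torch.randn(2, 5, 16)).shape)   # torch.Size([2, 5, 16])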
class CustomLiteLAProcessor2_0:
"""Attention processor used typically in processing the SD3-like self-attention projections. add rms norm for query and key and apply RoPE"""
def __init__(self):
self.kernel_func = nn.ReLU(inplace=False)
self.eps = 1e-15
self.pad_val = 1.0
def apply_rotary_emb(
self,
x: torch.Tensor,
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
tensors contain rotary embeddings and are returned as real tensors.
Args:
x (`torch.Tensor`):
Query or key tensor to apply rotary embeddings to, of shape [B, H, S, D].
freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
Returns:
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
"""
cos, sin = freqs_cis # [S, D]
cos = cos[None, None]
sin = sin[None, None]
cos, sin = cos.to(x.device), sin.to(x.device)
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
return out
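`apply_rotary_emb` above treats each pair of channels as the real and imaginary parts of a complex number and rotates them by the precomputed (cos, sin) tables. A small self-contained check of the same rotation; the shapes and the norm-preservation test are chosen for illustration:

import torch

def apply_rotary_emb(x, cos, sin):
    """Rotate channel pairs of x ([B, H, S, D]) by per-position angles (cos/sin of shape [S, D])."""
    cos, sin = cos[None, None], sin[None, None]
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

if __name__ == "__main__":
    B, H, S, D = 1, 1, 3, 4
    theta = torch.rand(S, D // 2)
    cos = torch.cos(theta).repeat_interleave(2, dim=-1)   # [S, D], each pair shares an angle
    sin = torch.sin(theta).repeat_interleave(2, dim=-1)
    x = torch.randn(B, H, S, D)
    out = apply_rotary_emb(x, cos, sin)
    # A pure rotation preserves the norm of every channel pair.
    print(torch.allclose(out.reshape(B, H, S, -1, 2).norm(dim=-1),
                         x.reshape(B, H, S, -1, 2).norm(dim=-1), atol=1e-5))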
def __call__(
self,
attn: Attention,
hidden_states: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
*args,
**kwargs,
) -> torch.FloatTensor:
hidden_states_len = hidden_states.shape[1]
input_ndim = hidden_states.ndim
if input_ndim == 4:
batch_size, channel, height, width = hidden_states.shape
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
if encoder_hidden_states is not None:
context_input_ndim = encoder_hidden_states.ndim
if context_input_ndim == 4:
batch_size, channel, height, width = encoder_hidden_states.shape
encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
batch_size = hidden_states.shape[0]
# `sample` projections.
dtype = hidden_states.dtype
query = attn.to_q(hidden_states)
key = attn.to_k(hidden_states)
value = attn.to_v(hidden_states)
# `context` projections.
has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
if encoder_hidden_states is not None and has_encoder_hidden_state_proj:
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
# attention
if not attn.is_cross_attention:
query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
else:
query = hidden_states
key = encoder_hidden_states
value = encoder_hidden_states
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
# RoPE expects input of shape [B, H, S, D]
# query is currently [B, H, D, S]; convert it to [B, H, S, D] before applying RoPE
query = query.permute(0, 1, 3, 2) # [B, H, S, D] (从 [B, H, D, S])
# Apply query and key normalization if needed
if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)
# Apply RoPE if needed
if rotary_freqs_cis is not None:
query = self.apply_rotary_emb(query, rotary_freqs_cis)
if not attn.is_cross_attention:
key = self.apply_rotary_emb(key, rotary_freqs_cis)
elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
# query is now [B, H, S, D]; restore it to [B, H, D, S]
query = query.permute(0, 1, 3, 2) # [B, H, D, S]
if attention_mask is not None:
# attention_mask: [B, S] -> [B, 1, S, 1]
attention_mask = attention_mask[:, None, :, None].to(key.dtype) # [B, 1, S, 1]
query = query * attention_mask.permute(0, 1, 3, 2) # [B, H, S, D] * [B, 1, S, 1]
if not attn.is_cross_attention:
key = key * attention_mask # key: [B, h, S, D] multiplied by mask [B, 1, S, 1]
value = value * attention_mask.permute(0, 1, 3, 2) # value is [B, h, D, S], so permute the mask to match the S dimension
if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(key.dtype) # [B, 1, S_enc, 1]
# here key: [B, h, S_enc, D], value: [B, h, D, S_enc]
key = key * encoder_attention_mask # [B, h, S_enc, D] * [B, 1, S_enc, 1]
value = value * encoder_attention_mask.permute(0, 1, 3, 2) # [B, h, D, S_enc] * [B, 1, 1, S_enc]
query = self.kernel_func(query)
key = self.kernel_func(key)
query, key, value = query.float(), key.float(), value.float()
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
vk = torch.matmul(value, key)
hidden_states = torch.matmul(vk, query)
if hidden_states.dtype in [torch.float16, torch.bfloat16]:
hidden_states = hidden_states.float()
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
hidden_states = hidden_states.to(dtype)
if encoder_hidden_states is not None:
encoder_hidden_states = encoder_hidden_states.to(dtype)
# Split the attention outputs.
if encoder_hidden_states is not None and not attn.is_cross_attention and has_encoder_hidden_state_proj:
hidden_states, encoder_hidden_states = (
hidden_states[:, : hidden_states_len],
hidden_states[:, hidden_states_len:],
)
# linear proj
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
if encoder_hidden_states is not None and not attn.context_pre_only and not attn.is_cross_attention and hasattr(attn, "to_add_out"):
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
if input_ndim == 4:
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
if encoder_hidden_states is not None and context_input_ndim == 4:
encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
if torch.get_autocast_gpu_dtype() == torch.float16:
hidden_states = hidden_states.clip(-65504, 65504)
if encoder_hidden_states is not None:
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
return hidden_states, encoder_hidden_states
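# Illustrative helper (not part of the original file): a minimal standalone sketch of the
# ReLU linear-attention kernel used by CustomLiteLAProcessor2_0 above, with assumed toy
# shapes. The value tensor is padded with a constant row so the last channel of
# (value @ key) @ query accumulates the normalizer, which is then divided out.
def _linear_attention_sketch():
    import torch
    import torch.nn.functional as F
    B, H, D, S = 1, 2, 4, 8                          # batch, heads, head_dim, sequence (assumed)
    query = torch.relu(torch.randn(B, H, D, S))      # kernel features, laid out [B, H, D, S]
    key = torch.relu(torch.randn(B, H, S, D))        # [B, H, S, D]
    value = torch.randn(B, H, D, S)                  # [B, H, D, S]
    value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1.0)  # -> [B, H, D+1, S]
    vk = torch.matmul(value, key)                    # [B, H, D+1, D]
    out = torch.matmul(vk, query)                    # [B, H, D+1, S]
    out = out[:, :, :-1] / (out[:, :, -1:] + 1e-15)  # divide by the accumulated normalizer
    return out                                       # [B, H, D, S]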
class CustomerAttnProcessor2_0:
r"""
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
"""
def apply_rotary_emb(
self,
x: torch.Tensor,
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
) -> torch.Tensor:
"""
Apply rotary embeddings to the query or key tensor `x` using the precomputed frequency tensor `freqs_cis`.
The tensor is split into real/imaginary channel pairs, rotated by the broadcast cos/sin tables, and returned
as a real tensor with the same shape and dtype as the input.
Args:
x (`torch.Tensor`):
Query or key tensor of shape [B, H, S, D] to apply rotary embeddings to.
freqs_cis (`Tuple[torch.Tensor]`): Precomputed cos/sin frequency tensors, each of shape [S, D].
Returns:
torch.Tensor: The input tensor with rotary embeddings applied.
"""
cos, sin = freqs_cis # [S, D]
cos = cos[None, None]
sin = sin[None, None]
cos, sin = cos.to(x.device), sin.to(x.device)
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
return out
def __call__(
self,
attn: Attention,
hidden_states: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
attention_mask: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
*args,
**kwargs,
) -> torch.Tensor:
residual = hidden_states
input_ndim = hidden_states.ndim
if input_ndim == 4:
batch_size, channel, height, width = hidden_states.shape
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)
# Apply RoPE if needed
if rotary_freqs_cis is not None:
query = self.apply_rotary_emb(query, rotary_freqs_cis)
if not attn.is_cross_attention:
key = self.apply_rotary_emb(key, rotary_freqs_cis)
elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
# attention_mask: N x S1
# encoder_attention_mask: N x S2
# cross attention: combine attention_mask and encoder_attention_mask
combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
attention_mask = attention_mask[:, None, :, :].expand(-1, attn.heads, -1, -1).to(query.dtype)
elif not attn.is_cross_attention and attention_mask is not None:
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
# scaled_dot_product_attention expects attention_mask shape to be
# (batch, heads, source_length, target_length)
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
# the output of sdp = (batch, num_heads, seq_len, head_dim)
hidden_states = optimized_attention(
query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
).to(query.dtype)
# linear proj
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
if input_ndim == 4:
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
if attn.residual_connection:
hidden_states = hidden_states + residual
hidden_states = hidden_states / attn.rescale_output_factor
return hidden_states
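# Illustrative helper (not part of the original file): a hedged sketch with toy sizes of the
# cross-attention mask construction in CustomerAttnProcessor2_0 above. Padded query and key
# positions are combined into a [B, heads, S1, S2] additive bias of 0 / -inf.
def _cross_attention_mask_sketch():
    import torch
    attention_mask = torch.tensor([[1, 1, 0]])               # B=1, S1=3 query padding mask
    encoder_attention_mask = torch.tensor([[1, 0]])          # B=1, S2=2 key padding mask
    combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
    bias = torch.where(combined_mask == 1, 0.0, -torch.inf)  # keep -> 0, masked -> -inf
    bias = bias[:, None, :, :].expand(-1, 4, -1, -1)         # broadcast over 4 heads (assumed)
    assert bias.shape == (1, 4, 3, 2)
    return bias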
def val2list(x: list or tuple or any, repeat_time=1) -> list: # type: ignore
"""Repeat `val` for `repeat_time` times and return the list or val if list/tuple."""
if isinstance(x, (list, tuple)):
return list(x)
return [x for _ in range(repeat_time)]
def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: # type: ignore
"""Return tuple with min_len by repeating element at idx_repeat."""
# convert to list first
x = val2list(x)
# repeat elements if necessary
if len(x) > 0:
x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
return tuple(x)
def t2i_modulate(x, shift, scale):
return x * (1 + scale) + shift
def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
if isinstance(kernel_size, tuple):
return tuple([get_same_padding(ks) for ks in kernel_size])
else:
assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be an odd number"
return kernel_size // 2
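# Illustrative helper (not part of the original file): a minimal sketch, with assumed toy
# values, of the small utilities above: broadcasting per-layer settings, adaLN-style
# modulation, and "same" padding for odd kernel sizes.
def _helpers_sketch():
    import torch
    assert val2list(3, repeat_time=2) == [3, 3]
    assert val2tuple((None, "silu"), min_len=3) == (None, "silu", "silu")
    assert get_same_padding(7) == 3 and get_same_padding((3, 1)) == (1, 0)
    x = torch.zeros(2, 4, 8)
    shift, scale = torch.zeros(2, 1, 8), torch.ones(2, 1, 8)
    assert t2i_modulate(x, shift, scale).shape == (2, 4, 8)  # x * (1 + scale) + shift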
class ConvLayer(nn.Module):
def __init__(
self,
in_dim: int,
out_dim: int,
kernel_size=3,
stride=1,
dilation=1,
groups=1,
padding: Union[int, None] = None,
use_bias=False,
norm=None,
act=None,
dtype=None, device=None, operations=None
):
super().__init__()
if padding is None:
padding = get_same_padding(kernel_size)
padding *= dilation
self.in_dim = in_dim
self.out_dim = out_dim
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
self.groups = groups
self.padding = padding
self.use_bias = use_bias
self.conv = operations.Conv1d(
in_dim,
out_dim,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=use_bias,
device=device,
dtype=dtype
)
if norm is not None:
self.norm = operations.RMSNorm(out_dim, elementwise_affine=False, dtype=dtype, device=device)
else:
self.norm = None
if act is not None:
self.act = nn.SiLU(inplace=True)
else:
self.act = None
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.conv(x)
if self.norm:
x = self.norm(x)
if self.act:
x = self.act(x)
return x
class GLUMBConv(nn.Module):
def __init__(
self,
in_features: int,
hidden_features: int,
out_feature=None,
kernel_size=3,
stride=1,
padding: Union[int, None] = None,
use_bias=False,
norm=(None, None, None),
act=("silu", "silu", None),
dilation=1,
dtype=None, device=None, operations=None
):
out_feature = out_feature or in_features
super().__init__()
use_bias = val2tuple(use_bias, 3)
norm = val2tuple(norm, 3)
act = val2tuple(act, 3)
self.glu_act = nn.SiLU(inplace=False)
self.inverted_conv = ConvLayer(
in_features,
hidden_features * 2,
1,
use_bias=use_bias[0],
norm=norm[0],
act=act[0],
dtype=dtype,
device=device,
operations=operations,
)
self.depth_conv = ConvLayer(
hidden_features * 2,
hidden_features * 2,
kernel_size,
stride=stride,
groups=hidden_features * 2,
padding=padding,
use_bias=use_bias[1],
norm=norm[1],
act=None,
dilation=dilation,
dtype=dtype,
device=device,
operations=operations,
)
self.point_conv = ConvLayer(
hidden_features,
out_feature,
1,
use_bias=use_bias[2],
norm=norm[2],
act=act[2],
dtype=dtype,
device=device,
operations=operations,
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.transpose(1, 2)
x = self.inverted_conv(x)
x = self.depth_conv(x)
x, gate = torch.chunk(x, 2, dim=1)
gate = self.glu_act(gate)
x = x * gate
x = self.point_conv(x)
x = x.transpose(1, 2)
return x
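# Illustrative helper (not part of the original file): a hedged usage sketch of the gated
# 1-D feed-forward above. Plain torch.nn.Conv1d stands in for the injected `operations`
# wrapper, which is an assumption for demonstration only; tokens enter and leave as [B, S, C].
def _glumbconv_sketch():
    import torch
    import torch.nn as nn

    class _Conv1dOps:
        Conv1d = nn.Conv1d  # assumed stand-in for the injected operations module

    ff = GLUMBConv(in_features=64, hidden_features=256, operations=_Conv1dOps)
    tokens = torch.randn(2, 10, 64)           # [B, S, C]
    assert ff(tokens).shape == (2, 10, 64)    # same token layout on the way out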
class LinearTransformerBlock(nn.Module):
"""
A Sana block with global shared adaptive layer norm (adaLN-single) conditioning.
"""
def __init__(
self,
dim,
num_attention_heads,
attention_head_dim,
use_adaln_single=True,
cross_attention_dim=None,
added_kv_proj_dim=None,
context_pre_only=False,
mlp_ratio=4.0,
add_cross_attention=False,
add_cross_attention_dim=None,
qk_norm=None,
dtype=None, device=None, operations=None
):
super().__init__()
self.norm1 = operations.RMSNorm(dim, elementwise_affine=False, eps=1e-6)
self.attn = Attention(
query_dim=dim,
cross_attention_dim=cross_attention_dim,
added_kv_proj_dim=added_kv_proj_dim,
dim_head=attention_head_dim,
heads=num_attention_heads,
out_dim=dim,
bias=True,
qk_norm=qk_norm,
processor=CustomLiteLAProcessor2_0(),
dtype=dtype,
device=device,
operations=operations,
)
self.add_cross_attention = add_cross_attention
self.context_pre_only = context_pre_only
if add_cross_attention and add_cross_attention_dim is not None:
self.cross_attn = Attention(
query_dim=dim,
cross_attention_dim=add_cross_attention_dim,
added_kv_proj_dim=add_cross_attention_dim,
dim_head=attention_head_dim,
heads=num_attention_heads,
out_dim=dim,
context_pre_only=context_pre_only,
bias=True,
qk_norm=qk_norm,
processor=CustomerAttnProcessor2_0(),
dtype=dtype,
device=device,
operations=operations,
)
self.norm2 = operations.RMSNorm(dim, 1e-06, elementwise_affine=False)
self.ff = GLUMBConv(
in_features=dim,
hidden_features=int(dim * mlp_ratio),
use_bias=(True, True, False),
norm=(None, None, None),
act=("silu", "silu", None),
dtype=dtype,
device=device,
operations=operations,
)
self.use_adaln_single = use_adaln_single
if use_adaln_single:
self.scale_shift_table = nn.Parameter(torch.empty(6, dim, dtype=dtype, device=device))
def forward(
self,
hidden_states: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
attention_mask: torch.FloatTensor = None,
encoder_attention_mask: torch.FloatTensor = None,
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
temb: torch.FloatTensor = None,
):
N = hidden_states.shape[0]
# step 1: AdaLN single
if self.use_adaln_single:
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
comfy.model_management.cast_to(self.scale_shift_table[None], dtype=temb.dtype, device=temb.device) + temb.reshape(N, 6, -1)
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
if self.use_adaln_single:
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
# step 2: attention
if not self.add_cross_attention:
attn_output, encoder_hidden_states = self.attn(
hidden_states=norm_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
)
else:
attn_output, _ = self.attn(
hidden_states=norm_hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=None,
encoder_attention_mask=None,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=None,
)
if self.use_adaln_single:
attn_output = gate_msa * attn_output
hidden_states = attn_output + hidden_states
if self.add_cross_attention:
attn_output = self.cross_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
)
hidden_states = attn_output + hidden_states
# step 3: add norm
norm_hidden_states = self.norm2(hidden_states)
if self.use_adaln_single:
norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
# step 4: feed forward
ff_output = self.ff(norm_hidden_states)
if self.use_adaln_single:
ff_output = gate_mlp * ff_output
hidden_states = hidden_states + ff_output
return hidden_states
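# Illustrative helper (not part of the original file): a minimal sketch, with assumed
# dimensions, of the adaLN-single conditioning used in steps 1/2/4 above. A learned [6, dim]
# table plus the projected timestep embedding yields six per-sample modulation tensors
# (shift/scale/gate for the attention branch and for the feed-forward branch).
def _adaln_single_sketch():
    import torch
    N, dim = 2, 16
    scale_shift_table = torch.zeros(6, dim)
    temb = torch.randn(N, 6 * dim)
    chunks = (scale_shift_table[None] + temb.reshape(N, 6, -1)).chunk(6, dim=1)
    shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks
    x = torch.randn(N, 10, dim)
    x_mod = x * (1 + scale_msa) + shift_msa   # same modulation as applied to norm1(x)
    assert x_mod.shape == (N, 10, dim) and gate_mlp.shape == (N, 1, dim)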

File diff suppressed because it is too large

View File

@@ -1,385 +0,0 @@
# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/ace_step_transformer.py
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, List, Union
import torch
from torch import nn
import comfy.model_management
from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
from .attention import LinearTransformerBlock, t2i_modulate
from .lyric_encoder import ConformerEncoder as LyricEncoder
def cross_norm(hidden_states, controlnet_input):
# input N x T x c
mean_hidden_states, std_hidden_states = hidden_states.mean(dim=(1,2), keepdim=True), hidden_states.std(dim=(1,2), keepdim=True)
mean_controlnet_input, std_controlnet_input = controlnet_input.mean(dim=(1,2), keepdim=True), controlnet_input.std(dim=(1,2), keepdim=True)
controlnet_input = (controlnet_input - mean_controlnet_input) * (std_hidden_states / (std_controlnet_input + 1e-12)) + mean_hidden_states
return controlnet_input
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
class Qwen2RotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, dtype=None, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=device).float() / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
# Different from the paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
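# Illustrative helper (not part of the original file): a hedged shape sketch for the rotary
# cache above. For a [B, H, S, D] input the module returns cos/sin tensors of shape [S, D],
# which the attention processors broadcast to [1, 1, S, D] before rotating pairs of channels.
def _rotary_cache_sketch():
    import torch
    rope = Qwen2RotaryEmbedding(dim=8, max_position_embeddings=32)
    x = torch.randn(1, 2, 16, 8)                  # [B, H, S, D] (assumed toy sizes)
    cos, sin = rope(x, seq_len=x.shape[2])
    assert cos.shape == (16, 8) and sin.shape == (16, 8)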
class T2IFinalLayer(nn.Module):
"""
The final layer of Sana.
"""
def __init__(self, hidden_size, patch_size=[16, 1], out_channels=256, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(hidden_size, patch_size[0] * patch_size[1] * out_channels, bias=True, dtype=dtype, device=device)
self.scale_shift_table = nn.Parameter(torch.empty(2, hidden_size, dtype=dtype, device=device))
self.out_channels = out_channels
self.patch_size = patch_size
def unpatchfy(
self,
hidden_states: torch.Tensor,
width: int,
):
# unpatchify
new_height, new_width = 1, hidden_states.size(1)
hidden_states = hidden_states.reshape(
shape=(hidden_states.shape[0], new_height, new_width, self.patch_size[0], self.patch_size[1], self.out_channels)
).contiguous()
hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
output = hidden_states.reshape(
shape=(hidden_states.shape[0], self.out_channels, new_height * self.patch_size[0], new_width * self.patch_size[1])
).contiguous()
if width > new_width:
output = torch.nn.functional.pad(output, (0, width - new_width, 0, 0), 'constant', 0)
elif width < new_width:
output = output[:, :, :, :width]
return output
def forward(self, x, t, output_length):
shift, scale = (comfy.model_management.cast_to(self.scale_shift_table[None], device=t.device, dtype=t.dtype) + t[:, None]).chunk(2, dim=1)
x = t2i_modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
# unpatchify
output = self.unpatchfy(x, output_length)
return output
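# Illustrative helper (not part of the original file): a standalone sketch, with assumed toy
# sizes, of the unpatchify path in T2IFinalLayer above. [N, S, p0*p1*C] tokens are folded
# back into an [N, C, p0, S*p1] latent via the "nhwpqc->nchpwq" einsum before any padding or
# cropping to the requested output length.
def _unpatchify_sketch():
    import torch
    N, S, p0, p1, C = 1, 64, 16, 1, 8
    tokens = torch.randn(N, S, p0 * p1 * C)
    x = tokens.reshape(N, 1, S, p0, p1, C)
    x = torch.einsum("nhwpqc->nchpwq", x)
    out = x.reshape(N, C, p0, S * p1)
    assert out.shape == (1, 8, 16, 64)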
class PatchEmbed(nn.Module):
"""2D Image to Patch Embedding"""
def __init__(
self,
height=16,
width=4096,
patch_size=(16, 1),
in_channels=8,
embed_dim=1152,
bias=True,
dtype=None, device=None, operations=None
):
super().__init__()
patch_size_h, patch_size_w = patch_size
self.early_conv_layers = nn.Sequential(
operations.Conv2d(in_channels, in_channels*256, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias, dtype=dtype, device=device),
operations.GroupNorm(num_groups=32, num_channels=in_channels*256, eps=1e-6, affine=True, dtype=dtype, device=device),
operations.Conv2d(in_channels*256, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias, dtype=dtype, device=device)
)
self.patch_size = patch_size
self.height, self.width = height // patch_size_h, width // patch_size_w
self.base_size = self.width
def forward(self, latent):
# early convolutions: N x C x H x W -> N x embed_dim x H/patch_size_h x W/patch_size_w
latent = self.early_conv_layers(latent)
latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC
return latent
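# Illustrative helper (not part of the original file): a hedged shape sketch of the patch
# embed above. Plain torch layers stand in for the injected `operations` module (an
# assumption); an 8-channel [N, 8, 16, W] mel latent with (16, 1) patches yields W tokens.
def _patch_embed_sketch():
    import torch
    import torch.nn as nn

    class _ConvOps:
        Conv2d = nn.Conv2d
        GroupNorm = nn.GroupNorm

    pe = PatchEmbed(in_channels=8, embed_dim=32, operations=_ConvOps)
    latent = torch.randn(1, 8, 16, 64)
    tokens = pe(latent)
    assert tokens.shape == (1, 64, 32)            # BCHW -> B, (H/16 * W/1), embed_dim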
class ACEStepTransformer2DModel(nn.Module):
# _supports_gradient_checkpointing = True
def __init__(
self,
in_channels: Optional[int] = 8,
num_layers: int = 28,
inner_dim: int = 1536,
attention_head_dim: int = 64,
num_attention_heads: int = 24,
mlp_ratio: float = 4.0,
out_channels: int = 8,
max_position: int = 32768,
rope_theta: float = 1000000.0,
speaker_embedding_dim: int = 512,
text_embedding_dim: int = 768,
ssl_encoder_depths: List[int] = [9, 9],
ssl_names: List[str] = ["mert", "m-hubert"],
ssl_latent_dims: List[int] = [1024, 768],
lyric_encoder_vocab_size: int = 6681,
lyric_hidden_size: int = 1024,
patch_size: List[int] = [16, 1],
max_height: int = 16,
max_width: int = 4096,
audio_model=None,
dtype=None, device=None, operations=None
):
super().__init__()
self.dtype = dtype
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
inner_dim = num_attention_heads * attention_head_dim
self.inner_dim = inner_dim
self.out_channels = out_channels
self.max_position = max_position
self.patch_size = patch_size
self.rope_theta = rope_theta
self.rotary_emb = Qwen2RotaryEmbedding(
dim=self.attention_head_dim,
max_position_embeddings=self.max_position,
base=self.rope_theta,
dtype=dtype,
device=device,
)
# 2. Define input layers
self.in_channels = in_channels
self.num_layers = num_layers
# 3. Define transformers blocks
self.transformer_blocks = nn.ModuleList(
[
LinearTransformerBlock(
dim=self.inner_dim,
num_attention_heads=self.num_attention_heads,
attention_head_dim=attention_head_dim,
mlp_ratio=mlp_ratio,
add_cross_attention=True,
add_cross_attention_dim=self.inner_dim,
dtype=dtype,
device=device,
operations=operations,
)
for i in range(self.num_layers)
]
)
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim, dtype=dtype, device=device, operations=operations)
self.t_block = nn.Sequential(nn.SiLU(), operations.Linear(self.inner_dim, 6 * self.inner_dim, bias=True, dtype=dtype, device=device))
# speaker
self.speaker_embedder = operations.Linear(speaker_embedding_dim, self.inner_dim, dtype=dtype, device=device)
# genre
self.genre_embedder = operations.Linear(text_embedding_dim, self.inner_dim, dtype=dtype, device=device)
# lyric
self.lyric_embs = operations.Embedding(lyric_encoder_vocab_size, lyric_hidden_size, dtype=dtype, device=device)
self.lyric_encoder = LyricEncoder(input_size=lyric_hidden_size, static_chunk_size=0, dtype=dtype, device=device, operations=operations)
self.lyric_proj = operations.Linear(lyric_hidden_size, self.inner_dim, dtype=dtype, device=device)
projector_dim = 2 * self.inner_dim
self.projectors = nn.ModuleList([
nn.Sequential(
operations.Linear(self.inner_dim, projector_dim, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(projector_dim, projector_dim, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(projector_dim, ssl_dim, dtype=dtype, device=device),
) for ssl_dim in ssl_latent_dims
])
self.proj_in = PatchEmbed(
height=max_height,
width=max_width,
patch_size=patch_size,
embed_dim=self.inner_dim,
bias=True,
dtype=dtype,
device=device,
operations=operations,
)
self.final_layer = T2IFinalLayer(self.inner_dim, patch_size=patch_size, out_channels=out_channels, dtype=dtype, device=device, operations=operations)
def forward_lyric_encoder(
self,
lyric_token_idx: Optional[torch.LongTensor] = None,
lyric_mask: Optional[torch.LongTensor] = None,
out_dtype=None,
):
# N x T x D
lyric_embs = self.lyric_embs(lyric_token_idx, out_dtype=out_dtype)
prompt_prenet_out, _mask = self.lyric_encoder(lyric_embs, lyric_mask, decoding_chunk_size=1, num_decoding_left_chunks=-1)
prompt_prenet_out = self.lyric_proj(prompt_prenet_out)
return prompt_prenet_out
def encode(
self,
encoder_text_hidden_states: Optional[torch.Tensor] = None,
text_attention_mask: Optional[torch.LongTensor] = None,
speaker_embeds: Optional[torch.FloatTensor] = None,
lyric_token_idx: Optional[torch.LongTensor] = None,
lyric_mask: Optional[torch.LongTensor] = None,
lyrics_strength=1.0,
):
bs = encoder_text_hidden_states.shape[0]
device = encoder_text_hidden_states.device
# speaker embedding
encoder_spk_hidden_states = self.speaker_embedder(speaker_embeds).unsqueeze(1)
# genre embedding
encoder_text_hidden_states = self.genre_embedder(encoder_text_hidden_states)
# lyric
encoder_lyric_hidden_states = self.forward_lyric_encoder(
lyric_token_idx=lyric_token_idx,
lyric_mask=lyric_mask,
out_dtype=encoder_text_hidden_states.dtype,
)
encoder_lyric_hidden_states *= lyrics_strength
encoder_hidden_states = torch.cat([encoder_spk_hidden_states, encoder_text_hidden_states, encoder_lyric_hidden_states], dim=1)
encoder_hidden_mask = None
if text_attention_mask is not None:
speaker_mask = torch.ones(bs, 1, device=device)
encoder_hidden_mask = torch.cat([speaker_mask, text_attention_mask, lyric_mask], dim=1)
return encoder_hidden_states, encoder_hidden_mask
def decode(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
encoder_hidden_states: torch.Tensor,
encoder_hidden_mask: torch.Tensor,
timestep: Optional[torch.Tensor],
output_length: int = 0,
block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
controlnet_scale: Union[float, torch.Tensor] = 1.0,
):
embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
temb = self.t_block(embedded_timestep)
hidden_states = self.proj_in(hidden_states)
# controlnet logic
if block_controlnet_hidden_states is not None:
control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
hidden_states = hidden_states + control_condi * controlnet_scale
# inner_hidden_states = []
rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])
for index_block, block in enumerate(self.transformer_blocks):
hidden_states = block(
hidden_states=hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_hidden_mask,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
temb=temb,
)
output = self.final_layer(hidden_states, embedded_timestep, output_length)
return output
def forward(
self,
x,
timestep,
attention_mask=None,
context: Optional[torch.Tensor] = None,
text_attention_mask: Optional[torch.LongTensor] = None,
speaker_embeds: Optional[torch.FloatTensor] = None,
lyric_token_idx: Optional[torch.LongTensor] = None,
lyric_mask: Optional[torch.LongTensor] = None,
block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
controlnet_scale: Union[float, torch.Tensor] = 1.0,
lyrics_strength=1.0,
**kwargs
):
hidden_states = x
encoder_text_hidden_states = context
encoder_hidden_states, encoder_hidden_mask = self.encode(
encoder_text_hidden_states=encoder_text_hidden_states,
text_attention_mask=text_attention_mask,
speaker_embeds=speaker_embeds,
lyric_token_idx=lyric_token_idx,
lyric_mask=lyric_mask,
lyrics_strength=lyrics_strength,
)
output_length = hidden_states.shape[-1]
output = self.decode(
hidden_states=hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_hidden_mask=encoder_hidden_mask,
timestep=timestep,
output_length=output_length,
block_controlnet_hidden_states=block_controlnet_hidden_states,
controlnet_scale=controlnet_scale,
)
return output

View File

@@ -1,644 +0,0 @@
# Rewritten from diffusers
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, Union
import comfy.model_management
import comfy.ops
ops = comfy.ops.disable_weight_init
class RMSNorm(ops.RMSNorm):
def __init__(self, dim, eps=1e-5, elementwise_affine=True, bias=False):
super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
if elementwise_affine:
self.bias = nn.Parameter(torch.empty(dim)) if bias else None
def forward(self, x):
x = super().forward(x)
if self.elementwise_affine:
if self.bias is not None:
x = x + comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device)
return x
def get_normalization(norm_type, num_features, num_groups=32, eps=1e-5):
if norm_type == "batch_norm":
return nn.BatchNorm2d(num_features)
elif norm_type == "group_norm":
return ops.GroupNorm(num_groups, num_features)
elif norm_type == "layer_norm":
return ops.LayerNorm(num_features)
elif norm_type == "rms_norm":
return RMSNorm(num_features, eps=eps, elementwise_affine=True, bias=True)
else:
raise ValueError(f"Unknown normalization type: {norm_type}")
def get_activation(activation_type):
if activation_type == "relu":
return nn.ReLU()
elif activation_type == "relu6":
return nn.ReLU6()
elif activation_type == "silu":
return nn.SiLU()
elif activation_type == "leaky_relu":
return nn.LeakyReLU(0.2)
else:
raise ValueError(f"Unknown activation type: {activation_type}")
class ResBlock(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
norm_type: str = "batch_norm",
act_fn: str = "relu6",
) -> None:
super().__init__()
self.norm_type = norm_type
self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity()
self.conv1 = ops.Conv2d(in_channels, in_channels, 3, 1, 1)
self.conv2 = ops.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False)
self.norm = get_normalization(norm_type, out_channels)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
residual = hidden_states
hidden_states = self.conv1(hidden_states)
hidden_states = self.nonlinearity(hidden_states)
hidden_states = self.conv2(hidden_states)
if self.norm_type == "rms_norm":
# move channel to the last dimension so we apply RMSnorm across channel dimension
hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
else:
hidden_states = self.norm(hidden_states)
return hidden_states + residual
class SanaMultiscaleAttentionProjection(nn.Module):
def __init__(
self,
in_channels: int,
num_attention_heads: int,
kernel_size: int,
) -> None:
super().__init__()
channels = 3 * in_channels
self.proj_in = ops.Conv2d(
channels,
channels,
kernel_size,
padding=kernel_size // 2,
groups=channels,
bias=False,
)
self.proj_out = ops.Conv2d(channels, channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.proj_in(hidden_states)
hidden_states = self.proj_out(hidden_states)
return hidden_states
class SanaMultiscaleLinearAttention(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
num_attention_heads: int = None,
attention_head_dim: int = 8,
mult: float = 1.0,
norm_type: str = "batch_norm",
kernel_sizes: tuple = (5,),
eps: float = 1e-15,
residual_connection: bool = False,
):
super().__init__()
self.eps = eps
self.attention_head_dim = attention_head_dim
self.norm_type = norm_type
self.residual_connection = residual_connection
num_attention_heads = (
int(in_channels // attention_head_dim * mult)
if num_attention_heads is None
else num_attention_heads
)
inner_dim = num_attention_heads * attention_head_dim
self.to_q = ops.Linear(in_channels, inner_dim, bias=False)
self.to_k = ops.Linear(in_channels, inner_dim, bias=False)
self.to_v = ops.Linear(in_channels, inner_dim, bias=False)
self.to_qkv_multiscale = nn.ModuleList()
for kernel_size in kernel_sizes:
self.to_qkv_multiscale.append(
SanaMultiscaleAttentionProjection(inner_dim, num_attention_heads, kernel_size)
)
self.nonlinearity = nn.ReLU()
self.to_out = ops.Linear(inner_dim * (1 + len(kernel_sizes)), out_channels, bias=False)
self.norm_out = get_normalization(norm_type, out_channels)
def apply_linear_attention(self, query, key, value):
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1)
scores = torch.matmul(value, key.transpose(-1, -2))
hidden_states = torch.matmul(scores, query)
hidden_states = hidden_states.to(dtype=torch.float32)
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
return hidden_states
def apply_quadratic_attention(self, query, key, value):
scores = torch.matmul(key.transpose(-1, -2), query)
scores = scores.to(dtype=torch.float32)
scores = scores / (torch.sum(scores, dim=2, keepdim=True) + self.eps)
hidden_states = torch.matmul(value, scores.to(value.dtype))
return hidden_states
def forward(self, hidden_states):
height, width = hidden_states.shape[-2:]
if height * width > self.attention_head_dim:
use_linear_attention = True
else:
use_linear_attention = False
residual = hidden_states
batch_size, _, height, width = list(hidden_states.size())
original_dtype = hidden_states.dtype
hidden_states = hidden_states.movedim(1, -1)
query = self.to_q(hidden_states)
key = self.to_k(hidden_states)
value = self.to_v(hidden_states)
hidden_states = torch.cat([query, key, value], dim=3)
hidden_states = hidden_states.movedim(-1, 1)
multi_scale_qkv = [hidden_states]
for block in self.to_qkv_multiscale:
multi_scale_qkv.append(block(hidden_states))
hidden_states = torch.cat(multi_scale_qkv, dim=1)
if use_linear_attention:
# for linear attention upcast hidden_states to float32
hidden_states = hidden_states.to(dtype=torch.float32)
hidden_states = hidden_states.reshape(batch_size, -1, 3 * self.attention_head_dim, height * width)
query, key, value = hidden_states.chunk(3, dim=2)
query = self.nonlinearity(query)
key = self.nonlinearity(key)
if use_linear_attention:
hidden_states = self.apply_linear_attention(query, key, value)
hidden_states = hidden_states.to(dtype=original_dtype)
else:
hidden_states = self.apply_quadratic_attention(query, key, value)
hidden_states = torch.reshape(hidden_states, (batch_size, -1, height, width))
hidden_states = self.to_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
if self.norm_type == "rms_norm":
hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
else:
hidden_states = self.norm_out(hidden_states)
if self.residual_connection:
hidden_states = hidden_states + residual
return hidden_states
class EfficientViTBlock(nn.Module):
def __init__(
self,
in_channels: int,
mult: float = 1.0,
attention_head_dim: int = 32,
qkv_multiscales: tuple = (5,),
norm_type: str = "batch_norm",
) -> None:
super().__init__()
self.attn = SanaMultiscaleLinearAttention(
in_channels=in_channels,
out_channels=in_channels,
mult=mult,
attention_head_dim=attention_head_dim,
norm_type=norm_type,
kernel_sizes=qkv_multiscales,
residual_connection=True,
)
self.conv_out = GLUMBConv(
in_channels=in_channels,
out_channels=in_channels,
norm_type="rms_norm",
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.attn(x)
x = self.conv_out(x)
return x
class GLUMBConv(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
expand_ratio: float = 4,
norm_type: str = None,
residual_connection: bool = True,
) -> None:
super().__init__()
hidden_channels = int(expand_ratio * in_channels)
self.norm_type = norm_type
self.residual_connection = residual_connection
self.nonlinearity = nn.SiLU()
self.conv_inverted = ops.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
self.conv_depth = ops.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
self.conv_point = ops.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
self.norm = None
if norm_type == "rms_norm":
self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if self.residual_connection:
residual = hidden_states
hidden_states = self.conv_inverted(hidden_states)
hidden_states = self.nonlinearity(hidden_states)
hidden_states = self.conv_depth(hidden_states)
hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
hidden_states = hidden_states * self.nonlinearity(gate)
hidden_states = self.conv_point(hidden_states)
if self.norm_type == "rms_norm":
# move channel to the last dimension so we apply RMSnorm across channel dimension
hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
if self.residual_connection:
hidden_states = hidden_states + residual
return hidden_states
def get_block(
block_type: str,
in_channels: int,
out_channels: int,
attention_head_dim: int,
norm_type: str,
act_fn: str,
qkv_mutliscales: tuple = (),
):
if block_type == "ResBlock":
block = ResBlock(in_channels, out_channels, norm_type, act_fn)
elif block_type == "EfficientViTBlock":
block = EfficientViTBlock(
in_channels,
attention_head_dim=attention_head_dim,
norm_type=norm_type,
qkv_multiscales=qkv_mutliscales
)
else:
raise ValueError(f"Block with {block_type=} is not supported.")
return block
class DCDownBlock2d(nn.Module):
def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
super().__init__()
self.downsample = downsample
self.factor = 2
self.stride = 1 if downsample else 2
self.group_size = in_channels * self.factor**2 // out_channels
self.shortcut = shortcut
out_ratio = self.factor**2
if downsample:
assert out_channels % out_ratio == 0
out_channels = out_channels // out_ratio
self.conv = ops.Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=self.stride,
padding=1,
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self.conv(hidden_states)
if self.downsample:
x = F.pixel_unshuffle(x, self.factor)
if self.shortcut:
y = F.pixel_unshuffle(hidden_states, self.factor)
y = y.unflatten(1, (-1, self.group_size))
y = y.mean(dim=2)
hidden_states = x + y
else:
hidden_states = x
return hidden_states
class DCUpBlock2d(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
interpolate: bool = False,
shortcut: bool = True,
interpolation_mode: str = "nearest",
) -> None:
super().__init__()
self.interpolate = interpolate
self.interpolation_mode = interpolation_mode
self.shortcut = shortcut
self.factor = 2
self.repeats = out_channels * self.factor**2 // in_channels
out_ratio = self.factor**2
if not interpolate:
out_channels = out_channels * out_ratio
self.conv = ops.Conv2d(in_channels, out_channels, 3, 1, 1)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if self.interpolate:
x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
x = self.conv(x)
else:
x = self.conv(hidden_states)
x = F.pixel_shuffle(x, self.factor)
if self.shortcut:
y = hidden_states.repeat_interleave(self.repeats, dim=1, output_size=hidden_states.shape[1] * self.repeats)
y = F.pixel_shuffle(y, self.factor)
hidden_states = x + y
else:
hidden_states = x
return hidden_states
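# Illustrative helper (not part of the original file): a hedged shape sketch of the two
# resampling blocks above. DCDownBlock2d halves H and W (channel averaging on the shortcut),
# DCUpBlock2d doubles them back (channel repeat on the shortcut); weights come from the
# module-level `ops` and are left uninitialized, so only shapes are meaningful here.
def _dc_resample_sketch():
    import torch
    down = DCDownBlock2d(in_channels=64, out_channels=128, downsample=True, shortcut=True)
    up = DCUpBlock2d(in_channels=128, out_channels=64, interpolate=False, shortcut=True)
    x = torch.randn(1, 64, 32, 32)
    y = down(x)
    assert y.shape == (1, 128, 16, 16)
    assert up(y).shape == (1, 64, 32, 32)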
class Encoder(nn.Module):
def __init__(
self,
in_channels: int,
latent_channels: int,
attention_head_dim: int = 32,
block_type: str or tuple = "ResBlock",
block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
downsample_block_type: str = "pixel_unshuffle",
out_shortcut: bool = True,
):
super().__init__()
num_blocks = len(block_out_channels)
if isinstance(block_type, str):
block_type = (block_type,) * num_blocks
if layers_per_block[0] > 0:
self.conv_in = ops.Conv2d(
in_channels,
block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
kernel_size=3,
stride=1,
padding=1,
)
else:
self.conv_in = DCDownBlock2d(
in_channels=in_channels,
out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
downsample=downsample_block_type == "pixel_unshuffle",
shortcut=False,
)
down_blocks = []
for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
down_block_list = []
for _ in range(num_layers):
block = get_block(
block_type[i],
out_channel,
out_channel,
attention_head_dim=attention_head_dim,
norm_type="rms_norm",
act_fn="silu",
qkv_mutliscales=qkv_multiscales[i],
)
down_block_list.append(block)
if i < num_blocks - 1 and num_layers > 0:
downsample_block = DCDownBlock2d(
in_channels=out_channel,
out_channels=block_out_channels[i + 1],
downsample=downsample_block_type == "pixel_unshuffle",
shortcut=True,
)
down_block_list.append(downsample_block)
down_blocks.append(nn.Sequential(*down_block_list))
self.down_blocks = nn.ModuleList(down_blocks)
self.conv_out = ops.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)
self.out_shortcut = out_shortcut
if out_shortcut:
self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.conv_in(hidden_states)
for down_block in self.down_blocks:
hidden_states = down_block(hidden_states)
if self.out_shortcut:
x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
x = x.mean(dim=2)
hidden_states = self.conv_out(hidden_states) + x
else:
hidden_states = self.conv_out(hidden_states)
return hidden_states
class Decoder(nn.Module):
def __init__(
self,
in_channels: int,
latent_channels: int,
attention_head_dim: int = 32,
block_type: str or tuple = "ResBlock",
block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
norm_type: str or tuple = "rms_norm",
act_fn: str or tuple = "silu",
upsample_block_type: str = "pixel_shuffle",
in_shortcut: bool = True,
):
super().__init__()
num_blocks = len(block_out_channels)
if isinstance(block_type, str):
block_type = (block_type,) * num_blocks
if isinstance(norm_type, str):
norm_type = (norm_type,) * num_blocks
if isinstance(act_fn, str):
act_fn = (act_fn,) * num_blocks
self.conv_in = ops.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)
self.in_shortcut = in_shortcut
if in_shortcut:
self.in_shortcut_repeats = block_out_channels[-1] // latent_channels
up_blocks = []
for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
up_block_list = []
if i < num_blocks - 1 and num_layers > 0:
upsample_block = DCUpBlock2d(
block_out_channels[i + 1],
out_channel,
interpolate=upsample_block_type == "interpolate",
shortcut=True,
)
up_block_list.append(upsample_block)
for _ in range(num_layers):
block = get_block(
block_type[i],
out_channel,
out_channel,
attention_head_dim=attention_head_dim,
norm_type=norm_type[i],
act_fn=act_fn[i],
qkv_mutliscales=qkv_multiscales[i],
)
up_block_list.append(block)
up_blocks.insert(0, nn.Sequential(*up_block_list))
self.up_blocks = nn.ModuleList(up_blocks)
channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
self.conv_act = nn.ReLU()
self.conv_out = None
if layers_per_block[0] > 0:
self.conv_out = ops.Conv2d(channels, in_channels, 3, 1, 1)
else:
self.conv_out = DCUpBlock2d(
channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
if self.in_shortcut:
x = hidden_states.repeat_interleave(
self.in_shortcut_repeats, dim=1, output_size=hidden_states.shape[1] * self.in_shortcut_repeats
)
hidden_states = self.conv_in(hidden_states) + x
else:
hidden_states = self.conv_in(hidden_states)
for up_block in reversed(self.up_blocks):
hidden_states = up_block(hidden_states)
hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
hidden_states = self.conv_act(hidden_states)
hidden_states = self.conv_out(hidden_states)
return hidden_states
class AutoencoderDC(nn.Module):
def __init__(
self,
in_channels: int = 2,
latent_channels: int = 8,
attention_head_dim: int = 32,
encoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
decoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
encoder_layers_per_block: Tuple[int] = (2, 2, 3, 3),
decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3),
encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
upsample_block_type: str = "interpolate",
downsample_block_type: str = "Conv",
decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
decoder_act_fns: Union[str, Tuple[str]] = "silu",
scaling_factor: float = 0.41407,
) -> None:
super().__init__()
self.encoder = Encoder(
in_channels=in_channels,
latent_channels=latent_channels,
attention_head_dim=attention_head_dim,
block_type=encoder_block_types,
block_out_channels=encoder_block_out_channels,
layers_per_block=encoder_layers_per_block,
qkv_multiscales=encoder_qkv_multiscales,
downsample_block_type=downsample_block_type,
)
self.decoder = Decoder(
in_channels=in_channels,
latent_channels=latent_channels,
attention_head_dim=attention_head_dim,
block_type=decoder_block_types,
block_out_channels=decoder_block_out_channels,
layers_per_block=decoder_layers_per_block,
qkv_multiscales=decoder_qkv_multiscales,
norm_type=decoder_norm_types,
act_fn=decoder_act_fns,
upsample_block_type=upsample_block_type,
)
self.scaling_factor = scaling_factor
self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
def encode(self, x: torch.Tensor) -> torch.Tensor:
"""Internal encoding function."""
encoded = self.encoder(x)
return encoded * self.scaling_factor
def decode(self, z: torch.Tensor) -> torch.Tensor:
# Scale the latents back
z = z / self.scaling_factor
decoded = self.decoder(z)
return decoded
def forward(self, x: torch.Tensor) -> torch.Tensor:
z = self.encode(x)
return self.decode(z)
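# Illustrative helper (not part of the original file): a hedged round-trip sketch of the
# autoencoder above with an assumed, deliberately tiny two-stage config (the real ACE config
# is larger). Only tensor shapes are checked; weights are uninitialized.
def _dcae_roundtrip_sketch():
    import torch
    vae = AutoencoderDC(
        in_channels=2, latent_channels=8,
        encoder_block_types=["ResBlock", "ResBlock"],
        decoder_block_types=["ResBlock", "ResBlock"],
        encoder_block_out_channels=(32, 64),
        decoder_block_out_channels=(32, 64),
        encoder_layers_per_block=(1, 1),
        decoder_layers_per_block=(1, 1),
        encoder_qkv_multiscales=((), ()),
        decoder_qkv_multiscales=((), ()),
    )
    x = torch.randn(1, 2, 32, 32)
    z = vae.encode(x)
    assert z.shape == (1, 8, 16, 16)              # one 2x downsampling stage
    assert vae.decode(z).shape == (1, 2, 32, 32)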

View File

@@ -1,109 +0,0 @@
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_dcae_pipeline.py
import torch
from .autoencoder_dc import AutoencoderDC
import logging
try:
import torchaudio
except:
logging.warning("torchaudio missing, ACE model will be broken")
import torchvision.transforms as transforms
from .music_vocoder import ADaMoSHiFiGANV1
class MusicDCAE(torch.nn.Module):
def __init__(self, source_sample_rate=None, dcae_config={}, vocoder_config={}):
super(MusicDCAE, self).__init__()
self.dcae = AutoencoderDC(**dcae_config)
self.vocoder = ADaMoSHiFiGANV1(**vocoder_config)
if source_sample_rate is None:
self.source_sample_rate = 48000
else:
self.source_sample_rate = source_sample_rate
# self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
self.transform = transforms.Compose([
transforms.Normalize(0.5, 0.5),
])
self.min_mel_value = -11.0
self.max_mel_value = 3.0
self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
self.mel_chunk_size = 1024
self.time_dimention_multiple = 8
self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
self.scale_factor = 0.1786
self.shift_factor = -1.9091
def load_audio(self, audio_path):
audio, sr = torchaudio.load(audio_path)
return audio, sr
def forward_mel(self, audios):
mels = []
for i in range(len(audios)):
image = self.vocoder.mel_transform(audios[i])
mels.append(image)
mels = torch.stack(mels)
return mels
@torch.no_grad()
def encode(self, audios, audio_lengths=None, sr=None):
if audio_lengths is None:
audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
audio_lengths = audio_lengths.to(audios.device)
if sr is None:
sr = self.source_sample_rate
if sr != 44100:
audios = torchaudio.functional.resample(audios, sr, 44100)
max_audio_len = audios.shape[-1]
if max_audio_len % (8 * 512) != 0:
audios = torch.nn.functional.pad(audios, (0, 8 * 512 - max_audio_len % (8 * 512)))
mels = self.forward_mel(audios)
mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
mels = self.transform(mels)
latents = []
for mel in mels:
latent = self.dcae.encoder(mel.unsqueeze(0))
latents.append(latent)
latents = torch.cat(latents, dim=0)
# latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
latents = (latents - self.shift_factor) * self.scale_factor
return latents
# return latents, latent_lengths
@torch.no_grad()
def decode(self, latents, audio_lengths=None, sr=None):
latents = latents / self.scale_factor + self.shift_factor
pred_wavs = []
for latent in latents:
mels = self.dcae.decoder(latent.unsqueeze(0))
mels = mels * 0.5 + 0.5
mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
wav = self.vocoder.decode(mels[0]).squeeze(1)
if sr is not None:
# resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
wav = torchaudio.functional.resample(wav, 44100, sr)
# wav = resampler(wav)
else:
sr = 44100
pred_wavs.append(wav)
if audio_lengths is not None:
pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
return torch.stack(pred_wavs)
# return sr, pred_wavs
def forward(self, audios, audio_lengths=None, sr=None):
    latents = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
    pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
    return pred_wavs, latents

View File

@@ -1,113 +0,0 @@
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_log_mel.py
import torch
import torch.nn as nn
from torch import Tensor
import logging
try:
from torchaudio.transforms import MelScale
except:
logging.warning("torchaudio missing, ACE model will be broken")
import comfy.model_management
class LinearSpectrogram(nn.Module):
def __init__(
self,
n_fft=2048,
win_length=2048,
hop_length=512,
center=False,
mode="pow2_sqrt",
):
super().__init__()
self.n_fft = n_fft
self.win_length = win_length
self.hop_length = hop_length
self.center = center
self.mode = mode
self.register_buffer("window", torch.hann_window(win_length))
def forward(self, y: Tensor) -> Tensor:
if y.ndim == 3:
y = y.squeeze(1)
y = torch.nn.functional.pad(
y.unsqueeze(1),
(
(self.win_length - self.hop_length) // 2,
(self.win_length - self.hop_length + 1) // 2,
),
mode="reflect",
).squeeze(1)
dtype = y.dtype
spec = torch.stft(
y.float(),
self.n_fft,
hop_length=self.hop_length,
win_length=self.win_length,
window=comfy.model_management.cast_to(self.window, dtype=torch.float32, device=y.device),
center=self.center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
spec = torch.view_as_real(spec)
if self.mode == "pow2_sqrt":
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
spec = spec.to(dtype)
return spec
class LogMelSpectrogram(nn.Module):
def __init__(
self,
sample_rate=44100,
n_fft=2048,
win_length=2048,
hop_length=512,
n_mels=128,
center=False,
f_min=0.0,
f_max=None,
):
super().__init__()
self.sample_rate = sample_rate
self.n_fft = n_fft
self.win_length = win_length
self.hop_length = hop_length
self.center = center
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max or sample_rate // 2
self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
self.mel_scale = MelScale(
self.n_mels,
self.sample_rate,
self.f_min,
self.f_max,
self.n_fft // 2 + 1,
"slaney",
"slaney",
)
def compress(self, x: Tensor) -> Tensor:
return torch.log(torch.clamp(x, min=1e-5))
def decompress(self, x: Tensor) -> Tensor:
return torch.exp(x)
def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
linear = self.spectrogram(x)
x = self.mel_scale(linear)
x = self.compress(x)
# print(x.shape)
if return_linear:
return x, self.compress(linear)
return x
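# Illustrative helper (not part of the original file): a hedged usage sketch of the mel
# front-end above, assuming torchaudio is available. One second of 44.1 kHz audio yields a
# [B, n_mels, frames] log-mel with roughly T // hop_length frames.
def _log_mel_sketch():
    import torch
    mel = LogMelSpectrogram()
    audio = torch.randn(1, 44100)                 # [B, T], one second at 44.1 kHz
    spec = mel(audio)
    assert spec.shape[:2] == (1, 128)             # [B, n_mels, frames]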

View File

@@ -1,538 +0,0 @@
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_vocoder.py
import torch
from torch import nn
from functools import partial
from math import prod
from typing import Callable, Tuple, List
import numpy as np
import torch.nn.functional as F
from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
from .music_log_mel import LogMelSpectrogram
import comfy.model_management
import comfy.ops
ops = comfy.ops.disable_weight_init
def drop_path(
x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
""" # noqa: E501
if drop_prob == 0.0 or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (
x.ndim - 1
) # work with diff dim tensors, not just 2D ConvNets
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
if keep_prob > 0.0 and scale_by_keep:
random_tensor.div_(keep_prob)
return x * random_tensor
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
def extra_repr(self):
return f"drop_prob={round(self.drop_prob,3):0.3f}"
class LayerNorm(nn.Module):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
""" # noqa: E501
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape,)
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(
x, self.normalized_shape, comfy.model_management.cast_to(self.weight, dtype=x.dtype, device=x.device), comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device), self.eps
)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = comfy.model_management.cast_to(self.weight[:, None], dtype=x.dtype, device=x.device) * x + comfy.model_management.cast_to(self.bias[:, None], dtype=x.dtype, device=x.device)
return x
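A short usage sketch of the two layouts this LayerNorm accepts (illustrative only, values are placeholders):

    ln_last = LayerNorm(64, data_format="channels_last")    # normalizes inputs shaped (..., 64)
    ln_first = LayerNorm(64, data_format="channels_first")   # normalizes (N, 64, L) over dim 1
    y_last = ln_last(torch.randn(2, 100, 64))
    y_first = ln_first(torch.randn(2, 64, 100))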
class ConvNeXtBlock(nn.Module):
r"""ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in PyTorch
Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
kernel_size (int): Kernel size for depthwise conv. Default: 7.
dilation (int): Dilation for depthwise conv. Default: 1.
""" # noqa: E501
def __init__(
self,
dim: int,
drop_path: float = 0.0,
layer_scale_init_value: float = 1e-6,
mlp_ratio: float = 4.0,
kernel_size: int = 7,
dilation: int = 1,
):
super().__init__()
self.dwconv = ops.Conv1d(
dim,
dim,
kernel_size=kernel_size,
padding=int(dilation * (kernel_size - 1) / 2),
groups=dim,
) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = ops.Linear(
dim, int(mlp_ratio * dim)
) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = ops.Linear(int(mlp_ratio * dim), dim)
self.gamma = (
nn.Parameter(torch.empty((dim)), requires_grad=False)
if layer_scale_init_value > 0
else None
)
self.drop_path = DropPath(
drop_path) if drop_path > 0.0 else nn.Identity()
def forward(self, x, apply_residual: bool = True):
input = x
x = self.dwconv(x)
x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) * x
x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
x = self.drop_path(x)
if apply_residual:
x = input + x
return x
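A brief shape check (not from the original file): the block keeps the (N, C, L) layout at its boundary and only permutes internally so the pointwise convolutions can run as Linear layers over the channel dimension.

    block = ConvNeXtBlock(dim=96, kernel_size=7)
    x = torch.randn(2, 96, 250)
    assert block(x).shape == (2, 96, 250)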
class ParallelConvNeXtBlock(nn.Module):
def __init__(self, kernel_sizes: List[int], *args, **kwargs):
super().__init__()
self.blocks = nn.ModuleList(
[
ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
for kernel_size in kernel_sizes
]
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return torch.stack(
[block(x, apply_residual=False) for block in self.blocks] + [x],
dim=1,
).sum(dim=1)
class ConvNeXtEncoder(nn.Module):
def __init__(
self,
input_channels=3,
depths=[3, 3, 9, 3],
dims=[96, 192, 384, 768],
drop_path_rate=0.0,
layer_scale_init_value=1e-6,
kernel_sizes: Tuple[int] = (7,),
):
super().__init__()
assert len(depths) == len(dims)
self.channel_layers = nn.ModuleList()
stem = nn.Sequential(
ops.Conv1d(
input_channels,
dims[0],
kernel_size=7,
padding=3,
padding_mode="replicate",
),
LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
)
self.channel_layers.append(stem)
for i in range(len(depths) - 1):
mid_layer = nn.Sequential(
LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
ops.Conv1d(dims[i], dims[i + 1], kernel_size=1),
)
self.channel_layers.append(mid_layer)
block_fn = (
partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
if len(kernel_sizes) == 1
else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
)
self.stages = nn.ModuleList()
drop_path_rates = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
]
cur = 0
for i in range(len(depths)):
stage = nn.Sequential(
*[
block_fn(
dim=dims[i],
drop_path=drop_path_rates[cur + j],
layer_scale_init_value=layer_scale_init_value,
)
for j in range(depths[i])
]
)
self.stages.append(stage)
cur += depths[i]
self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
def forward(
self,
x: torch.Tensor,
) -> torch.Tensor:
for channel_layer, stage in zip(self.channel_layers, self.stages):
x = channel_layer(x)
x = stage(x)
return self.norm(x)
def get_padding(kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super().__init__()
self.convs1 = nn.ModuleList(
[
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]),
)
),
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]),
)
),
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2]),
)
),
]
)
self.convs2 = nn.ModuleList(
[
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
channels,
channels,
kernel_size,
1,
dilation=1,
padding=get_padding(kernel_size, 1),
)
),
]
)
def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.silu(x)
xt = c1(xt)
xt = F.silu(xt)
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for conv in self.convs1:
remove_weight_norm(conv)
for conv in self.convs2:
remove_weight_norm(conv)
class HiFiGANGenerator(nn.Module):
def __init__(
self,
*,
hop_length: int = 512,
upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
resblock_dilation_sizes: Tuple[Tuple[int]] = (
(1, 3, 5), (1, 3, 5), (1, 3, 5)),
num_mels: int = 128,
upsample_initial_channel: int = 512,
use_template: bool = True,
pre_conv_kernel_size: int = 7,
post_conv_kernel_size: int = 7,
post_activation: Callable = partial(nn.SiLU, inplace=True),
):
super().__init__()
assert (
prod(upsample_rates) == hop_length
), f"hop_length must be {prod(upsample_rates)}"
self.conv_pre = torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
num_mels,
upsample_initial_channel,
pre_conv_kernel_size,
1,
padding=get_padding(pre_conv_kernel_size),
)
)
self.num_upsamples = len(upsample_rates)
self.num_kernels = len(resblock_kernel_sizes)
self.noise_convs = nn.ModuleList()
self.use_template = use_template
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
c_cur = upsample_initial_channel // (2 ** (i + 1))
self.ups.append(
torch.nn.utils.parametrizations.weight_norm(
ops.ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
if not use_template:
continue
if i + 1 < len(upsample_rates):
stride_f0 = np.prod(upsample_rates[i + 1:])
self.noise_convs.append(
ops.Conv1d(
1,
c_cur,
kernel_size=stride_f0 * 2,
stride=stride_f0,
padding=stride_f0 // 2,
)
)
else:
self.noise_convs.append(ops.Conv1d(1, c_cur, kernel_size=1))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
self.resblocks.append(ResBlock1(ch, k, d))
self.activation_post = post_activation()
self.conv_post = torch.nn.utils.parametrizations.weight_norm(
ops.Conv1d(
ch,
1,
post_conv_kernel_size,
1,
padding=get_padding(post_conv_kernel_size),
)
)
def forward(self, x, template=None):
x = self.conv_pre(x)
for i in range(self.num_upsamples):
x = F.silu(x, inplace=True)
x = self.ups[i](x)
if self.use_template:
x = x + self.noise_convs[i](template)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = self.activation_post(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
for up in self.ups:
remove_weight_norm(up)
for block in self.resblocks:
block.remove_weight_norm()
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
class ADaMoSHiFiGANV1(nn.Module):
def __init__(
self,
input_channels: int = 128,
depths: List[int] = [3, 3, 9, 3],
dims: List[int] = [128, 256, 384, 512],
drop_path_rate: float = 0.0,
kernel_sizes: Tuple[int] = (7,),
upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
resblock_dilation_sizes: Tuple[Tuple[int]] = (
(1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)),
num_mels: int = 512,
upsample_initial_channel: int = 1024,
use_template: bool = False,
pre_conv_kernel_size: int = 13,
post_conv_kernel_size: int = 13,
sampling_rate: int = 44100,
n_fft: int = 2048,
win_length: int = 2048,
hop_length: int = 512,
f_min: int = 40,
f_max: int = 16000,
n_mels: int = 128,
):
super().__init__()
self.backbone = ConvNeXtEncoder(
input_channels=input_channels,
depths=depths,
dims=dims,
drop_path_rate=drop_path_rate,
kernel_sizes=kernel_sizes,
)
self.head = HiFiGANGenerator(
hop_length=hop_length,
upsample_rates=upsample_rates,
upsample_kernel_sizes=upsample_kernel_sizes,
resblock_kernel_sizes=resblock_kernel_sizes,
resblock_dilation_sizes=resblock_dilation_sizes,
num_mels=num_mels,
upsample_initial_channel=upsample_initial_channel,
use_template=use_template,
pre_conv_kernel_size=pre_conv_kernel_size,
post_conv_kernel_size=post_conv_kernel_size,
)
self.sampling_rate = sampling_rate
self.mel_transform = LogMelSpectrogram(
sample_rate=sampling_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
f_min=f_min,
f_max=f_max,
n_mels=n_mels,
)
self.eval()
@torch.no_grad()
def decode(self, mel):
y = self.backbone(mel)
y = self.head(y)
return y
@torch.no_grad()
def encode(self, x):
return self.mel_transform(x)
def forward(self, mel):
y = self.backbone(mel)
y = self.head(y)
return y
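A rough usage sketch of the vocoder round trip (illustrative only; exact input shapes depend on the LogMelSpectrogram defined in music_log_mel.py):

    # model = ADaMoSHiFiGANV1()
    # mel = model.encode(waveform)   # log-mel features with n_mels=128 channels
    # audio = model.decode(mel)      # waveform upsampled by hop_length=512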

View File

@@ -75,10 +75,16 @@ class SnakeBeta(nn.Module):
         return x

 def WNConv1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    except:
+        return torch.nn.utils.weight_norm(ops.Conv1d(*args, **kwargs)) #support pytorch 2.1 and older

 def WNConvTranspose1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    except:
+        return torch.nn.utils.weight_norm(ops.ConvTranspose1d(*args, **kwargs)) #support pytorch 2.1 and older

 def get_activation(activation: Literal["elu", "snake", "none"], antialias=False, channels=None) -> nn.Module:
     if activation == "elu":
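The added fallback simply tries the parametrization-based API first and, per the inline comment, drops back to the older torch.nn.utils.weight_norm entry point on PyTorch 2.1 and older. A generic sketch of the same pattern:

    try:
        weight_norm = torch.nn.utils.parametrizations.weight_norm
    except AttributeError:
        weight_norm = torch.nn.utils.weight_norm  # older PyTorch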

View File

@@ -19,10 +19,6 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-import comfy.ops
-ops = comfy.ops.disable_weight_init
 class vector_quantize(Function):
     @staticmethod
@@ -125,15 +121,15 @@ class ResBlock(nn.Module):
         self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
         self.depthwise = nn.Sequential(
             nn.ReplicationPad2d(1),
-            ops.Conv2d(c, c, kernel_size=3, groups=c)
+            nn.Conv2d(c, c, kernel_size=3, groups=c)
         )
         # channelwise
         self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
         self.channelwise = nn.Sequential(
-            ops.Linear(c, c_hidden),
+            nn.Linear(c, c_hidden),
             nn.GELU(),
-            ops.Linear(c_hidden, c),
+            nn.Linear(c_hidden, c),
         )
         self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@@ -175,16 +171,16 @@ class StageA(nn.Module):
         # Encoder blocks
         self.in_block = nn.Sequential(
             nn.PixelUnshuffle(2),
-            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
         )
         down_blocks = []
         for i in range(levels):
             if i > 0:
-                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
             block = ResBlock(c_levels[i], c_levels[i] * 4)
             down_blocks.append(block)
         down_blocks.append(nn.Sequential(
-            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
             nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
         ))
         self.down_blocks = nn.Sequential(*down_blocks)
@@ -195,7 +191,7 @@ class StageA(nn.Module):
         # Decoder blocks
         up_blocks = [nn.Sequential(
-            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
         )]
         for i in range(levels):
             for j in range(bottleneck_blocks if i == 0 else 1):
@@ -203,11 +199,11 @@ class StageA(nn.Module):
                 up_blocks.append(block)
             if i < levels - 1:
                 up_blocks.append(
-                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                         padding=1))
         self.up_blocks = nn.Sequential(*up_blocks)
         self.out_block = nn.Sequential(
-            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
             nn.PixelShuffle(2),
         )
@@ -236,17 +232,17 @@ class Discriminator(nn.Module):
         super().__init__()
         d = max(depth - 3, 3)
         layers = [
-            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
             nn.LeakyReLU(0.2),
         ]
         for i in range(depth - 1):
             c_in = c_hidden // (2 ** max((d - i), 0))
             c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
             layers.append(nn.InstanceNorm2d(c_out))
             layers.append(nn.LeakyReLU(0.2))
         self.encoder = nn.Sequential(*layers)
-        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
         self.logits = nn.Sigmoid()
     def forward(self, x, cond=None):

View File

@@ -19,9 +19,6 @@ import torch
 import torchvision
 from torch import nn
-import comfy.ops
-ops = comfy.ops.disable_weight_init
 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@@ -29,7 +26,7 @@ class EfficientNetEncoder(nn.Module):
         super().__init__()
         self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
         self.mapper = nn.Sequential(
-            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
             nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
         )
         self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@@ -37,7 +34,7 @@ class EfficientNetEncoder(nn.Module):
     def forward(self, x):
         x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
+        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
         o = self.mapper(self.backbone(x))
         return o
@@ -47,39 +44,39 @@ class Previewer(nn.Module):
     def __init__(self, c_in=16, c_hidden=512, c_out=3):
         super().__init__()
         self.blocks = nn.Sequential(
-            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
             nn.GELU(),
             nn.BatchNorm2d(c_hidden),
-            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
             nn.GELU(),
             nn.BatchNorm2d(c_hidden),
-            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
             nn.GELU(),
             nn.BatchNorm2d(c_hidden // 2),
-            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
             nn.GELU(),
             nn.BatchNorm2d(c_hidden // 2),
-            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
             nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
             nn.GELU(),
             nn.BatchNorm2d(c_hidden // 4),
-            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
             nn.GELU(),
             nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
             nn.GELU(),
             nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
         )
     def forward(self, x):

View File

@@ -1,181 +0,0 @@
import torch
from torch import Tensor, nn
from comfy.ldm.flux.math import attention
from comfy.ldm.flux.layers import (
MLPEmbedder,
RMSNorm,
QKNorm,
SelfAttention,
ModulationOut,
)
class ChromaModulationOut(ModulationOut):
@classmethod
def from_offset(cls, tensor: torch.Tensor, offset: int = 0) -> ModulationOut:
return cls(
shift=tensor[:, offset : offset + 1, :],
scale=tensor[:, offset + 1 : offset + 2, :],
gate=tensor[:, offset + 2 : offset + 3, :],
)
class Approximator(nn.Module):
def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5, dtype=None, device=None, operations=None):
super().__init__()
self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
@property
def device(self):
# Get the device of the module (assumes all parameters are on the same device)
return next(self.parameters()).device
def forward(self, x: Tensor) -> Tensor:
x = self.in_proj(x)
for layer, norms in zip(self.layers, self.norms):
x = x + layer(norms(x))
x = self.out_proj(x)
return x
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
self.num_heads = num_heads
self.hidden_size = hidden_size
self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.img_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.txt_mlp = nn.Sequential(
operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
nn.GELU(approximate="tanh"),
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None):
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
# prepare image for attention
img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
img_qkv = self.img_attn.qkv(img_modulated)
img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
# prepare txt for attention
txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
txt_qkv = self.txt_attn.qkv(txt_modulated)
txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
# run actual attention
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2),
pe=pe, mask=attn_mask)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
# calculate the img bloks
img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
# calculate the txt bloks
txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
if txt.dtype == torch.float16:
txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
return img, txt
class SingleStreamBlock(nn.Module):
"""
A DiT block with parallel linear layers as described in
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
"""
def __init__(
self,
hidden_size: int,
num_heads: int,
mlp_ratio: float = 4.0,
qk_scale: float = None,
dtype=None,
device=None,
operations=None
):
super().__init__()
self.hidden_dim = hidden_size
self.num_heads = num_heads
head_dim = hidden_size // num_heads
self.scale = qk_scale or head_dim**-0.5
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
# qkv and mlp_in
self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
# proj and mlp_out
self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
self.hidden_size = hidden_size
self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.mlp_act = nn.GELU(approximate="tanh")
def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
mod = vec
x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x.addcmul_(mod.gate, output)
if x.dtype == torch.float16:
x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
return x
class LastLayer(nn.Module):
def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
super().__init__()
self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
shift, scale = vec
shift = shift.squeeze(1)
scale = scale.squeeze(1)
x = torch.addcmul(shift[:, None, :], 1 + scale[:, None, :], self.norm_final(x))
x = self.linear(x)
return x

View File

@@ -1,271 +0,0 @@
#Original code can be found on: https://github.com/black-forest-labs/flux
from dataclasses import dataclass
import torch
from torch import Tensor, nn
from einops import rearrange, repeat
import comfy.ldm.common_dit
from comfy.ldm.flux.layers import (
EmbedND,
timestep_embedding,
)
from .layers import (
DoubleStreamBlock,
LastLayer,
SingleStreamBlock,
Approximator,
ChromaModulationOut,
)
@dataclass
class ChromaParams:
in_channels: int
out_channels: int
context_in_dim: int
hidden_size: int
mlp_ratio: float
num_heads: int
depth: int
depth_single_blocks: int
axes_dim: list
theta: int
patch_size: int
qkv_bias: bool
in_dim: int
out_dim: int
hidden_dim: int
n_layers: int
class Chroma(nn.Module):
"""
Transformer model for flow matching on sequences.
"""
def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.dtype = dtype
params = ChromaParams(**kwargs)
self.params = params
self.patch_size = params.patch_size
self.in_channels = params.in_channels
self.out_channels = params.out_channels
if params.hidden_size % params.num_heads != 0:
raise ValueError(
f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
)
pe_dim = params.hidden_size // params.num_heads
if sum(params.axes_dim) != pe_dim:
raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
self.hidden_size = params.hidden_size
self.num_heads = params.num_heads
self.in_dim = params.in_dim
self.out_dim = params.out_dim
self.hidden_dim = params.hidden_dim
self.n_layers = params.n_layers
self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
# set as nn identity for now, will overwrite it later.
self.distilled_guidance_layer = Approximator(
in_dim=self.in_dim,
hidden_dim=self.hidden_dim,
out_dim=self.out_dim,
n_layers=self.n_layers,
dtype=dtype, device=device, operations=operations
)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
self.hidden_size,
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
]
)
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
for _ in range(params.depth_single_blocks)
]
)
if final_layer:
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
self.skip_mmdit = []
self.skip_dit = []
self.lite = False
def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0):
# This function slices up the modulations tensor which has the following layout:
# single : num_single_blocks * 3 elements
# double_img : num_double_blocks * 6 elements
# double_txt : num_double_blocks * 6 elements
# final : 2 elements
if block_type == "final":
return (tensor[:, -2:-1, :], tensor[:, -1:, :])
single_block_count = self.params.depth_single_blocks
double_block_count = self.params.depth
offset = 3 * idx
if block_type == "single":
return ChromaModulationOut.from_offset(tensor, offset)
# Double block modulations are 6 elements so we double 3 * idx.
offset *= 2
if block_type in {"double_img", "double_txt"}:
# Advance past the single block modulations.
offset += 3 * single_block_count
if block_type == "double_txt":
# Advance past the double block img modulations.
offset += 6 * double_block_count
return (
ChromaModulationOut.from_offset(tensor, offset),
ChromaModulationOut.from_offset(tensor, offset + 3),
)
raise ValueError("Bad block_type")
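A worked example of the layout described in the comment above, assuming a Flux-style configuration with depth_single_blocks=38 and depth=19 (which is what yields the 344 rows used in forward_orig): single modulations occupy rows [0, 114), double_img rows [114, 228), double_txt rows [228, 342), and the final pair the last two rows [342, 344). So get_modulations(t, "double_txt", idx=0) reads rows 228-233 as two (shift, scale, gate) triples.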
def forward_orig(
self,
img: Tensor,
img_ids: Tensor,
txt: Tensor,
txt_ids: Tensor,
timesteps: Tensor,
guidance: Tensor = None,
control = None,
transformer_options={},
attn_mask: Tensor = None,
) -> Tensor:
patches_replace = transformer_options.get("patches_replace", {})
if img.ndim != 3 or txt.ndim != 3:
raise ValueError("Input img and txt tensors must have 3 dimensions.")
# running on sequences img
img = self.img_in(img)
# distilled vector guidance
mod_index_length = 344
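        # 344 = 3 * 38 (single) + 6 * 19 * 2 (double img + txt) + 2 (final), assuming 38 single and 19 double blocks.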
distill_timestep = timestep_embedding(timesteps.detach().clone(), 16).to(img.device, img.dtype)
# guidance = guidance *
distil_guidance = timestep_embedding(guidance.detach().clone(), 16).to(img.device, img.dtype)
# get all modulation index
modulation_index = timestep_embedding(torch.arange(mod_index_length, device=img.device), 32).to(img.device, img.dtype)
# we need to broadcast the modulation index here so each batch has all of the index
modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype)
# and we need to broadcast timestep and guidance along too
timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype)
# then and only then we could concatenate it together
input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype)
mod_vectors = self.distilled_guidance_layer(input_vec)
txt = self.txt_in(txt)
ids = torch.cat((txt_ids, img_ids), dim=1)
pe = self.pe_embedder(ids)
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.double_blocks):
if i not in self.skip_mmdit:
double_mod = (
self.get_modulations(mod_vectors, "double_img", idx=i),
self.get_modulations(mod_vectors, "double_txt", idx=i),
)
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"], out["txt"] = block(img=args["img"],
txt=args["txt"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
return out
out = blocks_replace[("double_block", i)]({"img": img,
"txt": txt,
"vec": double_mod,
"pe": pe,
"attn_mask": attn_mask},
{"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
else:
img, txt = block(img=img,
txt=txt,
vec=double_mod,
pe=pe,
attn_mask=attn_mask)
if control is not None: # Controlnet
control_i = control.get("input")
if i < len(control_i):
add = control_i[i]
if add is not None:
img += add
img = torch.cat((txt, img), 1)
for i, block in enumerate(self.single_blocks):
if i not in self.skip_dit:
single_mod = self.get_modulations(mod_vectors, "single", idx=i)
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
return out
out = blocks_replace[("single_block", i)]({"img": img,
"vec": single_mod,
"pe": pe,
"attn_mask": attn_mask},
{"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask)
if control is not None: # Controlnet
control_o = control.get("output")
if i < len(control_o):
add = control_o[i]
if add is not None:
img[:, txt.shape[1] :, ...] += add
img = img[:, txt.shape[1] :, ...]
final_mod = self.get_modulations(mod_vectors, "final")
img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels)
return img
def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
bs, c, h, w = x.shape
patch_size = 2
x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
h_len = ((h + (patch_size // 2)) // patch_size)
w_len = ((w + (patch_size // 2)) // patch_size)
img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]

View File

@@ -1,6 +1,5 @@
 import torch
-import comfy.rmsnorm
+import comfy.ops
 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
     if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
@@ -12,5 +11,20 @@ def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
     return torch.nn.functional.pad(img, pad, mode=padding_mode)
+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+except:
+    rms_norm_torch = None
-rms_norm = comfy.rmsnorm.rms_norm
+def rms_norm(x, weight=None, eps=1e-6):
+    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
+        if weight is None:
+            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
+        else:
+            return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    else:
+        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+        if weight is None:
+            return r
+        else:
+            return r * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)

View File

@@ -23,6 +23,7 @@ from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
 from torch import nn
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
 from comfy.ldm.modules.attention import optimized_attention
@@ -36,11 +37,11 @@ def apply_rotary_pos_emb(
     return t_out
-def get_normalization(name: str, channels: int, weight_args={}, operations=None):
+def get_normalization(name: str, channels: int, weight_args={}):
     if name == "I":
         return nn.Identity()
     elif name == "R":
-        return operations.RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
+        return RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
     else:
         raise ValueError(f"Normalization {name} not found")
@@ -119,15 +120,15 @@ class Attention(nn.Module):
         self.to_q = nn.Sequential(
             operations.Linear(query_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[0], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[0], norm_dim),
         )
         self.to_k = nn.Sequential(
             operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[1], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[1], norm_dim),
         )
         self.to_v = nn.Sequential(
             operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[2], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[2], norm_dim),
         )
         self.to_out = nn.Sequential(

View File

@@ -27,6 +27,8 @@ from torchvision import transforms
 from enum import Enum
 import logging
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
 from .blocks import (
     FinalLayer,
     GeneralDITTransformerBlock,
@@ -193,7 +195,7 @@ class GeneralDIT(nn.Module):
         if self.affline_emb_norm:
             logging.debug("Building affine embedding normalization layer")
-            self.affline_norm = operations.RMSNorm(model_channels, elementwise_affine=True, eps=1e-6, device=device, dtype=dtype)
+            self.affline_norm = RMSNorm(model_channels, elementwise_affine=True, eps=1e-6)
         else:
             self.affline_norm = nn.Identity()

View File

@@ -105,9 +105,7 @@ class Modulation(nn.Module):
         self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
     def forward(self, vec: Tensor) -> tuple:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
         return (
             ModulationOut(*out[:3]),
@@ -115,20 +113,6 @@ class Modulation(nn.Module):
         )
-def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
-    if modulation_dims is None:
-        if m_add is not None:
-            return tensor * m_mult + m_add
-        else:
-            return tensor * m_mult
-    else:
-        for d in modulation_dims:
-            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
-            if m_add is not None:
-                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
-        return tensor
 class DoubleStreamBlock(nn.Module):
     def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
         super().__init__()
@@ -159,20 +143,20 @@ class DoubleStreamBlock(nn.Module):
         )
         self.flipped_img_txt = flipped_img_txt
-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None):
         img_mod1, img_mod2 = self.img_mod(vec)
         txt_mod1, txt_mod2 = self.txt_mod(vec)
         # prepare image for attention
         img_modulated = self.img_norm1(img)
-        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
         img_qkv = self.img_attn.qkv(img_modulated)
         img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
         # prepare txt for attention
         txt_modulated = self.txt_norm1(txt)
-        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
         txt_qkv = self.txt_attn.qkv(txt_modulated)
         txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
@@ -195,12 +179,12 @@ class DoubleStreamBlock(nn.Module):
         txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
         # calculate the img bloks
-        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
-        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
         # calculate the txt bloks
-        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
-        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
         if txt.dtype == torch.float16:
             txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@@ -244,9 +228,9 @@ class SingleStreamBlock(nn.Module):
         self.mlp_act = nn.GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None) -> Tensor:
         mod, _ = self.modulation(vec)
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        qkv, mlp = torch.split(self.linear1((1 + mod.scale) * self.pre_norm(x) + mod.shift), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
         q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
@@ -255,7 +239,7 @@ class SingleStreamBlock(nn.Module):
         attn = attention(q, k, v, pe=pe, mask=attn_mask)
         # compute activation in mlp stream, cat again and run second linear layer
         output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += apply_mod(output, mod.gate, None, modulation_dims)
+        x += mod.gate * output
         if x.dtype == torch.float16:
             x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
         return x
@@ -268,11 +252,8 @@ class LastLayer(nn.Module):
         self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
         self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
-    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
-        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
         x = self.linear(x)
         return x

View File

@@ -10,11 +10,10 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
     q_shape = q.shape
     k_shape = k.shape
-    if pe is not None:
-        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
-        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
-        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
-        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
+    q = q.float().reshape(*q.shape[:-1], -1, 1, 2)
+    k = k.float().reshape(*k.shape[:-1], -1, 1, 2)
+    q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
+    k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
     heads = q.shape[1]
     x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
@@ -37,8 +36,8 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
     xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
     xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
     return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)

View File

@@ -115,11 +115,8 @@ class Flux(nn.Module):
         vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
         txt = self.txt_in(txt)
-        if img_ids is not None:
-            ids = torch.cat((txt_ids, img_ids), dim=1)
-            pe = self.pe_embedder(ids)
-        else:
-            pe = None
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)
         blocks_replace = patches_replace.get("dit", {})
         for i, block in enumerate(self.double_blocks):

View File

@@ -13,6 +13,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from .layers import (
     FeedForward,
     PatchEmbed,
+    RMSNorm,
     TimestepEmbedder,
 )
@@ -89,10 +90,10 @@ class AsymmetricAttention(nn.Module):
         # Query and key normalization for stability.
         assert qk_norm
-        self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
+        self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
         # Output layers. y features go back down from dim_x -> dim_y.
         self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)

View File

@@ -151,3 +151,14 @@ class PatchEmbed(nn.Module):
         x = self.norm(x)
         return x
+class RMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
+        self.register_parameter("bias", None)
+    def forward(self, x):
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)

View File

@@ -1,802 +0,0 @@
from typing import Optional, Tuple, List
import torch
import torch.nn as nn
import einops
from einops import repeat
from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
import torch.nn.functional as F
from comfy.ldm.flux.math import apply_rope, rope
from comfy.ldm.flux.layers import LastLayer
from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
import comfy.ldm.common_dit
# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
class EmbedND(nn.Module):
def __init__(self, theta: int, axes_dim: List[int]):
super().__init__()
self.theta = theta
self.axes_dim = axes_dim
def forward(self, ids: torch.Tensor) -> torch.Tensor:
n_axes = ids.shape[-1]
emb = torch.cat(
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
dim=-3,
)
return emb.unsqueeze(2)
class PatchEmbed(nn.Module):
def __init__(
self,
patch_size=2,
in_channels=4,
out_channels=1024,
dtype=None, device=None, operations=None
):
super().__init__()
self.patch_size = patch_size
self.out_channels = out_channels
self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
def forward(self, latent):
latent = self.proj(latent)
return latent
class PooledEmbed(nn.Module):
def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
super().__init__()
self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
def forward(self, pooled_embed):
return self.pooled_embedder(pooled_embed)
class TimestepEmbed(nn.Module):
def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
super().__init__()
self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
def forward(self, timesteps, wdtype):
t_emb = self.time_proj(timesteps).to(dtype=wdtype)
t_emb = self.timestep_embedder(t_emb)
return t_emb
def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
class HiDreamAttnProcessor_flashattn:
"""Attention processor used typically in processing the SD3-like self-attention projections."""
def __call__(
self,
attn,
image_tokens: torch.FloatTensor,
image_tokens_masks: Optional[torch.FloatTensor] = None,
text_tokens: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
*args,
**kwargs,
) -> torch.FloatTensor:
dtype = image_tokens.dtype
batch_size = image_tokens.shape[0]
query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
value_i = attn.to_v(image_tokens)
inner_dim = key_i.shape[-1]
head_dim = inner_dim // attn.heads
query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
if image_tokens_masks is not None:
key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
if not attn.single:
query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
value_t = attn.to_v_t(text_tokens)
query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
num_image_tokens = query_i.shape[1]
num_text_tokens = query_t.shape[1]
query = torch.cat([query_i, query_t], dim=1)
key = torch.cat([key_i, key_t], dim=1)
value = torch.cat([value_i, value_t], dim=1)
else:
query = query_i
key = key_i
value = value_i
if query.shape[-1] == rope.shape[-3] * 2:
query, key = apply_rope(query, key, rope)
else:
query_1, query_2 = query.chunk(2, dim=-1)
key_1, key_2 = key.chunk(2, dim=-1)
query_1, key_1 = apply_rope(query_1, key_1, rope)
query = torch.cat([query_1, query_2], dim=-1)
key = torch.cat([key_1, key_2], dim=-1)
hidden_states = attention(query, key, value)
if not attn.single:
hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
hidden_states_i = attn.to_out(hidden_states_i)
hidden_states_t = attn.to_out_t(hidden_states_t)
return hidden_states_i, hidden_states_t
else:
hidden_states = attn.to_out(hidden_states)
return hidden_states
class HiDreamAttention(nn.Module):
def __init__(
self,
query_dim: int,
heads: int = 8,
dim_head: int = 64,
upcast_attention: bool = False,
upcast_softmax: bool = False,
scale_qk: bool = True,
eps: float = 1e-5,
processor = None,
out_dim: int = None,
single: bool = False,
dtype=None, device=None, operations=None
):
# super(Attention, self).__init__()
super().__init__()
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
self.query_dim = query_dim
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
self.out_dim = out_dim if out_dim is not None else query_dim
self.scale_qk = scale_qk
self.scale = dim_head**-0.5 if self.scale_qk else 1.0
self.heads = out_dim // dim_head if out_dim is not None else heads
self.sliceable_head_dim = heads
self.single = single
linear_cls = operations.Linear
self.linear_cls = linear_cls
self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
if not single:
self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
self.processor = processor
def forward(
self,
norm_image_tokens: torch.FloatTensor,
image_tokens_masks: torch.FloatTensor = None,
norm_text_tokens: torch.FloatTensor = None,
rope: torch.FloatTensor = None,
) -> torch.Tensor:
return self.processor(
self,
image_tokens = norm_image_tokens,
image_tokens_masks = image_tokens_masks,
text_tokens = norm_text_tokens,
rope = rope,
)
class FeedForwardSwiGLU(nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
multiple_of: int = 256,
ffn_dim_multiplier: Optional[float] = None,
dtype=None, device=None, operations=None
):
super().__init__()
hidden_dim = int(2 * hidden_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
hidden_dim = multiple_of * (
(hidden_dim + multiple_of - 1) // multiple_of
)
self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
def forward(self, x):
return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
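# Illustrative sketch (assumption, not from the original file): how the SwiGLU hidden width above
# is derived. With a hypothetical dim=2560 and hidden_dim=4*dim=10240, the 2/3 scaling gives 6826,
# which is then rounded up to the next multiple of 256, i.e. 6912.
def _swiglu_width_sketch(dim=2560, multiple_of=256):
    hidden_dim = int(2 * (4 * dim) / 3)                                   # 2/3 of the 4x expansion
    hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)  # round up to multiple_of
    return hidden_dim                                                     # 6912 for dim=2560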
# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
class MoEGate(nn.Module):
def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
super().__init__()
self.top_k = num_activated_experts
self.n_routed_experts = num_routed_experts
self.scoring_func = 'softmax'
self.alpha = aux_loss_alpha
self.seq_aux = False
# topk selection algorithm
self.norm_topk_prob = False
self.gating_dim = embed_dim
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
self.reset_parameters()
def reset_parameters(self) -> None:
pass
# import torch.nn.init as init
# init.kaiming_uniform_(self.weight, a=math.sqrt(5))
def forward(self, hidden_states):
bsz, seq_len, h = hidden_states.shape
### compute gating score
hidden_states = hidden_states.view(-1, h)
logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
if self.scoring_func == 'softmax':
scores = logits.softmax(dim=-1)
else:
raise NotImplementedError(f'unsupported scoring function for MoE gating: {self.scoring_func}')
### select top-k experts
topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
### norm gate to sum 1
if self.top_k > 1 and self.norm_topk_prob:
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
topk_weight = topk_weight / denominator
aux_loss = None
return topk_idx, topk_weight, aux_loss
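# Illustrative sketch, not part of the original file: the gate above scores every token against
# each routed expert, softmaxes the scores, and keeps the top-k experts per token. Sizes are
# hypothetical.
def _moe_gate_sketch(num_tokens=6, num_experts=4, top_k=2):
    import torch
    logits = torch.randn(num_tokens, num_experts)                  # hypothetical gating logits
    scores = logits.softmax(dim=-1)
    topk_weight, topk_idx = torch.topk(scores, k=top_k, dim=-1, sorted=False)
    return topk_idx, topk_weight                                   # per-token expert ids and weights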
# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
class MOEFeedForwardSwiGLU(nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
num_routed_experts: int,
num_activated_experts: int,
dtype=None, device=None, operations=None
):
super().__init__()
self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
self.gate = MoEGate(
embed_dim = dim,
num_routed_experts = num_routed_experts,
num_activated_experts = num_activated_experts,
dtype=dtype, device=device, operations=operations
)
self.num_activated_experts = num_activated_experts
def forward(self, x):
wtype = x.dtype
identity = x
orig_shape = x.shape
topk_idx, topk_weight, aux_loss = self.gate(x)
x = x.view(-1, x.shape[-1])
flat_topk_idx = topk_idx.view(-1)
if True: # self.training: # TODO: check which branch performs faster
x = x.repeat_interleave(self.num_activated_experts, dim=0)
y = torch.empty_like(x, dtype=wtype)
for i, expert in enumerate(self.experts):
y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
y = y.view(*orig_shape).to(dtype=wtype)
#y = AddAuxiliaryLoss.apply(y, aux_loss)
else:
y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
y = y + self.shared_experts(identity)
return y
@torch.no_grad()
def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
expert_cache = torch.zeros_like(x)
idxs = flat_expert_indices.argsort()
tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
token_idxs = idxs // self.num_activated_experts
for i, end_idx in enumerate(tokens_per_expert):
start_idx = 0 if i == 0 else tokens_per_expert[i-1]
if start_idx == end_idx:
continue
expert = self.experts[i]
exp_token_idx = token_idxs[start_idx:end_idx]
expert_tokens = x[exp_token_idx]
expert_out = expert(expert_tokens)
expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
# for fp16 and other dtype
expert_cache = expert_cache.to(expert_out.dtype)
expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
return expert_cache
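# Illustrative sketch (assumptions only, not the original code): how moe_infer groups token slots
# by expert. Sorting the flat expert indices and taking a cumulative bincount yields contiguous
# [start, end) ranges per expert; dividing the sorted slot positions by top_k recovers the
# originating token index for each slot.
def _expert_dispatch_sketch():
    import torch
    top_k = 2
    flat_expert_indices = torch.tensor([2, 0, 1, 2, 0, 1])  # 3 tokens x top_k expert slots
    idxs = flat_expert_indices.argsort()
    ends = flat_expert_indices.bincount().cumsum(0)          # per-expert end offsets
    token_idxs = idxs // top_k                               # slot position -> originating token
    groups, start = [], 0
    for expert_id, end in enumerate(ends.tolist()):
        groups.append((expert_id, token_idxs[start:end].tolist()))
        start = end
    return groups  # e.g. [(0, [0, 2]), (1, [1, 2]), (2, [0, 1])]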
class TextProjection(nn.Module):
def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
super().__init__()
self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
def forward(self, caption):
hidden_states = self.linear(caption)
return hidden_states
class BlockType:
TransformerBlock = 1
SingleTransformerBlock = 2
class HiDreamImageSingleTransformerBlock(nn.Module):
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
num_routed_experts: int = 4,
num_activated_experts: int = 2,
dtype=None, device=None, operations=None
):
super().__init__()
self.num_attention_heads = num_attention_heads
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
)
# 1. Attention
self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
self.attn1 = HiDreamAttention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
processor = HiDreamAttnProcessor_flashattn(),
single = True,
dtype=dtype, device=device, operations=operations
)
# 3. Feed-forward
self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
if num_routed_experts > 0:
self.ff_i = MOEFeedForwardSwiGLU(
dim = dim,
hidden_dim = 4 * dim,
num_routed_experts = num_routed_experts,
num_activated_experts = num_activated_experts,
dtype=dtype, device=device, operations=operations
)
else:
self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
def forward(
self,
image_tokens: torch.FloatTensor,
image_tokens_masks: Optional[torch.FloatTensor] = None,
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
) -> torch.FloatTensor:
wtype = image_tokens.dtype
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
# 1. MM-Attention
norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
attn_output_i = self.attn1(
norm_image_tokens,
image_tokens_masks,
rope = rope,
)
image_tokens = gate_msa_i * attn_output_i + image_tokens
# 2. Feed-forward
norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
image_tokens = ff_output_i + image_tokens
return image_tokens
class HiDreamImageTransformerBlock(nn.Module):
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
num_routed_experts: int = 4,
num_activated_experts: int = 2,
dtype=None, device=None, operations=None
):
super().__init__()
self.num_attention_heads = num_attention_heads
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
)
# nn.init.zeros_(self.adaLN_modulation[1].weight)
# nn.init.zeros_(self.adaLN_modulation[1].bias)
# 1. Attention
self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
self.attn1 = HiDreamAttention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
processor = HiDreamAttnProcessor_flashattn(),
single = False,
dtype=dtype, device=device, operations=operations
)
# 3. Feed-forward
self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
if num_routed_experts > 0:
self.ff_i = MOEFeedForwardSwiGLU(
dim = dim,
hidden_dim = 4 * dim,
num_routed_experts = num_routed_experts,
num_activated_experts = num_activated_experts,
dtype=dtype, device=device, operations=operations
)
else:
self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
def forward(
self,
image_tokens: torch.FloatTensor,
image_tokens_masks: Optional[torch.FloatTensor] = None,
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
) -> torch.FloatTensor:
wtype = image_tokens.dtype
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
# 1. MM-Attention
norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
attn_output_i, attn_output_t = self.attn1(
norm_image_tokens,
image_tokens_masks,
norm_text_tokens,
rope = rope,
)
image_tokens = gate_msa_i * attn_output_i + image_tokens
text_tokens = gate_msa_t * attn_output_t + text_tokens
# 2. Feed-forward
norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
image_tokens = ff_output_i + image_tokens
text_tokens = ff_output_t + text_tokens
return image_tokens, text_tokens
class HiDreamImageBlock(nn.Module):
def __init__(
self,
dim: int,
num_attention_heads: int,
attention_head_dim: int,
num_routed_experts: int = 4,
num_activated_experts: int = 2,
block_type: BlockType = BlockType.TransformerBlock,
dtype=None, device=None, operations=None
):
super().__init__()
block_classes = {
BlockType.TransformerBlock: HiDreamImageTransformerBlock,
BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
}
self.block = block_classes[block_type](
dim,
num_attention_heads,
attention_head_dim,
num_routed_experts,
num_activated_experts,
dtype=dtype, device=device, operations=operations
)
def forward(
self,
image_tokens: torch.FloatTensor,
image_tokens_masks: Optional[torch.FloatTensor] = None,
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: torch.FloatTensor = None,
rope: torch.FloatTensor = None,
) -> torch.FloatTensor:
return self.block(
image_tokens,
image_tokens_masks,
text_tokens,
adaln_input,
rope,
)
class HiDreamImageTransformer2DModel(nn.Module):
def __init__(
self,
patch_size: Optional[int] = None,
in_channels: int = 64,
out_channels: Optional[int] = None,
num_layers: int = 16,
num_single_layers: int = 32,
attention_head_dim: int = 128,
num_attention_heads: int = 20,
caption_channels: List[int] = None,
text_emb_dim: int = 2048,
num_routed_experts: int = 4,
num_activated_experts: int = 2,
axes_dims_rope: Tuple[int, int] = (32, 32),
max_resolution: Tuple[int, int] = (128, 128),
llama_layers: List[int] = None,
image_model=None,
dtype=None, device=None, operations=None
):
self.patch_size = patch_size
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
self.num_layers = num_layers
self.num_single_layers = num_single_layers
self.gradient_checkpointing = False
super().__init__()
self.dtype = dtype
self.out_channels = out_channels or in_channels
self.inner_dim = self.num_attention_heads * self.attention_head_dim
self.llama_layers = llama_layers
self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
self.x_embedder = PatchEmbed(
patch_size = patch_size,
in_channels = in_channels,
out_channels = self.inner_dim,
dtype=dtype, device=device, operations=operations
)
self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
self.double_stream_blocks = nn.ModuleList(
[
HiDreamImageBlock(
dim = self.inner_dim,
num_attention_heads = self.num_attention_heads,
attention_head_dim = self.attention_head_dim,
num_routed_experts = num_routed_experts,
num_activated_experts = num_activated_experts,
block_type = BlockType.TransformerBlock,
dtype=dtype, device=device, operations=operations
)
for i in range(self.num_layers)
]
)
self.single_stream_blocks = nn.ModuleList(
[
HiDreamImageBlock(
dim = self.inner_dim,
num_attention_heads = self.num_attention_heads,
attention_head_dim = self.attention_head_dim,
num_routed_experts = num_routed_experts,
num_activated_experts = num_activated_experts,
block_type = BlockType.SingleTransformerBlock,
dtype=dtype, device=device, operations=operations
)
for i in range(self.num_single_layers)
]
)
self.final_layer = LastLayer(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
caption_projection = []
for caption_channel in caption_channels:
caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
self.caption_projection = nn.ModuleList(caption_projection)
self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
def expand_timesteps(self, timesteps, batch_size, device):
if not torch.is_tensor(timesteps):
is_mps = device.type == "mps"
if isinstance(timesteps, float):
dtype = torch.float32 if is_mps else torch.float64
else:
dtype = torch.int32 if is_mps else torch.int64
timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
elif len(timesteps.shape) == 0:
timesteps = timesteps[None].to(device)
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timesteps = timesteps.expand(batch_size)
return timesteps
def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
x_arr = []
for i, img_size in enumerate(img_sizes):
pH, pW = img_size
x_arr.append(
einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
p1=self.patch_size, p2=self.patch_size)
)
x = torch.cat(x_arr, dim=0)
return x
def patchify(self, x, max_seq, img_sizes=None):
pz2 = self.patch_size * self.patch_size
if isinstance(x, torch.Tensor):
B = x.shape[0]
device = x.device
dtype = x.dtype
else:
B = len(x)
device = x[0].device
dtype = x[0].dtype
x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
if img_sizes is not None:
for i, img_size in enumerate(img_sizes):
x_masks[i, 0:img_size[0] * img_size[1]] = 1
x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
elif isinstance(x, torch.Tensor):
pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
img_sizes = [[pH, pW]] * B
x_masks = None
else:
raise NotImplementedError
return x, x_masks, img_sizes
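# Illustrative sketch, not part of the original file: the patchify/unpatchify pair above is an
# einops round trip; folding 2x2 patches into the channel dimension and unfolding them again
# reproduces the input exactly. Tensor sizes are hypothetical.
def _patchify_roundtrip_sketch():
    import torch
    import einops
    x = torch.randn(1, 16, 8, 8)                          # (B, C, H, W)
    p = 2
    tokens = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=p, p2=p)
    back = einops.rearrange(tokens.reshape(1, 4, 4, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)', p1=p, p2=p)
    return torch.allclose(x, back)                        # True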
def forward(
self,
x: torch.Tensor,
t: torch.Tensor,
y: Optional[torch.Tensor] = None,
context: Optional[torch.Tensor] = None,
encoder_hidden_states_llama3=None,
image_cond=None,
control = None,
transformer_options = {},
) -> torch.Tensor:
bs, c, h, w = x.shape
if image_cond is not None:
x = torch.cat([x, image_cond], dim=-1)
hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
timesteps = t
pooled_embeds = y
T5_encoder_hidden_states = context
img_sizes = None
# spatial forward
batch_size = hidden_states.shape[0]
hidden_states_type = hidden_states.dtype
# 0. time
timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
timesteps = self.t_embedder(timesteps, hidden_states_type)
p_embedder = self.p_embedder(pooled_embeds)
adaln_input = timesteps + p_embedder
hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
if image_tokens_masks is None:
pH, pW = img_sizes[0]
img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
hidden_states = self.x_embedder(hidden_states)
# T5_encoder_hidden_states = encoder_hidden_states[0]
encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
if self.caption_projection is not None:
new_encoder_hidden_states = []
for i, enc_hidden_state in enumerate(encoder_hidden_states):
enc_hidden_state = self.caption_projection[i](enc_hidden_state)
enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
new_encoder_hidden_states.append(enc_hidden_state)
encoder_hidden_states = new_encoder_hidden_states
T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
encoder_hidden_states.append(T5_encoder_hidden_states)
txt_ids = torch.zeros(
batch_size,
encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
3,
device=img_ids.device, dtype=img_ids.dtype
)
ids = torch.cat((img_ids, txt_ids), dim=1)
rope = self.pe_embedder(ids)
# 2. Blocks
block_id = 0
initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
for bid, block in enumerate(self.double_stream_blocks):
cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
hidden_states, initial_encoder_hidden_states = block(
image_tokens = hidden_states,
image_tokens_masks = image_tokens_masks,
text_tokens = cur_encoder_hidden_states,
adaln_input = adaln_input,
rope = rope,
)
initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
block_id += 1
image_tokens_seq_len = hidden_states.shape[1]
hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
hidden_states_seq_len = hidden_states.shape[1]
if image_tokens_masks is not None:
encoder_attention_mask_ones = torch.ones(
(batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
)
image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
for bid, block in enumerate(self.single_stream_blocks):
cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
hidden_states = block(
image_tokens=hidden_states,
image_tokens_masks=image_tokens_masks,
text_tokens=None,
adaln_input=adaln_input,
rope=rope,
)
hidden_states = hidden_states[:, :hidden_states_seq_len]
block_id += 1
hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
output = self.final_layer(hidden_states, adaln_input)
output = self.unpatchify(output, img_sizes)
return -output[:, :, :h, :w]

View File

@@ -1,135 +0,0 @@
import torch
from torch import nn
from comfy.ldm.flux.layers import (
DoubleStreamBlock,
LastLayer,
MLPEmbedder,
SingleStreamBlock,
timestep_embedding,
)
class Hunyuan3Dv2(nn.Module):
def __init__(
self,
in_channels=64,
context_in_dim=1536,
hidden_size=1024,
mlp_ratio=4.0,
num_heads=16,
depth=16,
depth_single_blocks=32,
qkv_bias=True,
guidance_embed=False,
image_model=None,
dtype=None,
device=None,
operations=None
):
super().__init__()
self.dtype = dtype
if hidden_size % num_heads != 0:
raise ValueError(
f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
)
self.max_period = 1000 # While reimplementing the model we noticed an upstream mistake: this 1000 was meant to be the time_factor, but it was passed as max_period instead.
self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
)
self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
self.double_blocks = nn.ModuleList(
[
DoubleStreamBlock(
hidden_size,
num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
dtype=dtype, device=device, operations=operations
)
for _ in range(depth)
]
)
self.single_blocks = nn.ModuleList(
[
SingleStreamBlock(
hidden_size,
num_heads,
mlp_ratio=mlp_ratio,
dtype=dtype, device=device, operations=operations
)
for _ in range(depth_single_blocks)
]
)
self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)
def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
x = x.movedim(-1, -2)
timestep = 1.0 - timestep
txt = context
img = self.latent_in(x)
vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
if self.guidance_in is not None:
if guidance is not None:
vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))
txt = self.cond_in(txt)
pe = None
attn_mask = None
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.double_blocks):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"], out["txt"] = block(img=args["img"],
txt=args["txt"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
return out
out = blocks_replace[("double_block", i)]({"img": img,
"txt": txt,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
{"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
else:
img, txt = block(img=img,
txt=txt,
vec=vec,
pe=pe,
attn_mask=attn_mask)
img = torch.cat((txt, img), 1)
for i, block in enumerate(self.single_blocks):
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
return out
out = blocks_replace[("single_block", i)]({"img": img,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
{"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
img = img[:, txt.shape[1]:, ...]
img = self.final_layer(img, vec)
return img.movedim(-2, -1) * (-1.0)

View File

@@ -1,587 +0,0 @@
# Original: https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/models/autoencoders/model.py
# Since the license header on their VAE source file was ambiguous, we asked Tencent for permission to use this code under the GPL license used by ComfyUI.
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Union, Tuple, List, Callable, Optional
import numpy as np
from einops import repeat, rearrange
from tqdm import tqdm
import logging
import comfy.ops
ops = comfy.ops.disable_weight_init
def generate_dense_grid_points(
bbox_min: np.ndarray,
bbox_max: np.ndarray,
octree_resolution: int,
indexing: str = "ij",
):
length = bbox_max - bbox_min
num_cells = octree_resolution
x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
[xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
xyz = np.stack((xs, ys, zs), axis=-1)
grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
return xyz, grid_size, length
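# Illustrative sketch, not part of the original file: for octree_resolution=8 the helper above
# samples 9 grid corners per axis, so xyz has shape (9, 9, 9, 3). Bounds below are hypothetical.
def _dense_grid_sketch():
    import numpy as np
    bbox_min, bbox_max = np.array([-1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0])
    res = 8
    axes = [np.linspace(bbox_min[i], bbox_max[i], res + 1, dtype=np.float32) for i in range(3)]
    xs, ys, zs = np.meshgrid(*axes, indexing="ij")
    xyz = np.stack((xs, ys, zs), axis=-1)
    return xyz.shape                                      # (9, 9, 9, 3)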
class VanillaVolumeDecoder:
@torch.no_grad()
def __call__(
self,
latents: torch.FloatTensor,
geo_decoder: Callable,
bounds: Union[Tuple[float], List[float], float] = 1.01,
num_chunks: int = 10000,
octree_resolution: int = None,
enable_pbar: bool = True,
**kwargs,
):
device = latents.device
dtype = latents.dtype
batch_size = latents.shape[0]
# 1. generate query points
if isinstance(bounds, float):
bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
xyz_samples, grid_size, length = generate_dense_grid_points(
bbox_min=bbox_min,
bbox_max=bbox_max,
octree_resolution=octree_resolution,
indexing="ij"
)
xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
# 2. latents to 3d volume
batch_logits = []
for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding",
disable=not enable_pbar):
chunk_queries = xyz_samples[start: start + num_chunks, :]
chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
logits = geo_decoder(queries=chunk_queries, latents=latents)
batch_logits.append(logits)
grid_logits = torch.cat(batch_logits, dim=1)
grid_logits = grid_logits.view((batch_size, *grid_size)).float()
return grid_logits
class FourierEmbedder(nn.Module):
"""The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
each feature dimension of `x[..., i]` into:
[
sin(x[..., i]),
sin(f_1*x[..., i]),
sin(f_2*x[..., i]),
...
sin(f_N * x[..., i]),
cos(x[..., i]),
cos(f_1*x[..., i]),
cos(f_2*x[..., i]),
...
cos(f_N * x[..., i]),
x[..., i] # only present if include_input is True.
], here f_i is the frequency.
If logspace is True, the frequencies are f_i = 2^i for i in [0, ..., num_freqs - 1];
otherwise, the frequencies are linearly spaced between 1.0 and 2^(num_freqs - 1).
Args:
num_freqs (int): the number of frequencies, default is 6;
logspace (bool): if True, the frequencies are f_i = 2^i,
otherwise the frequencies are linearly spaced between 1.0 and 2^(num_freqs - 1);
input_dim (int): the input dimension, default is 3;
include_input (bool): whether to include the input tensor, default is True.
Attributes:
frequencies (torch.Tensor): the frequency tensor described above;
out_dim (int): the embedding size; input_dim * (num_freqs * 2 + 1) if include_input is True,
otherwise input_dim * num_freqs * 2.
"""
def __init__(self,
num_freqs: int = 6,
logspace: bool = True,
input_dim: int = 3,
include_input: bool = True,
include_pi: bool = True) -> None:
"""The initialization"""
super().__init__()
if logspace:
frequencies = 2.0 ** torch.arange(
num_freqs,
dtype=torch.float32
)
else:
frequencies = torch.linspace(
1.0,
2.0 ** (num_freqs - 1),
num_freqs,
dtype=torch.float32
)
if include_pi:
frequencies *= torch.pi
self.register_buffer("frequencies", frequencies, persistent=False)
self.include_input = include_input
self.num_freqs = num_freqs
self.out_dim = self.get_dims(input_dim)
def get_dims(self, input_dim):
temp = 1 if self.include_input or self.num_freqs == 0 else 0
out_dim = input_dim * (self.num_freqs * 2 + temp)
return out_dim
def forward(self, x: torch.Tensor) -> torch.Tensor:
""" Forward process.
Args:
x: tensor of shape [..., dim]
Returns:
embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
where temp is 1 if include_input is True and 0 otherwise.
"""
if self.num_freqs > 0:
embed = (x[..., None].contiguous() * self.frequencies.to(device=x.device, dtype=x.dtype)).view(*x.shape[:-1], -1)
if self.include_input:
return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
else:
return torch.cat((embed.sin(), embed.cos()), dim=-1)
else:
return x
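# Illustrative sketch, not from the original file: with input_dim=3, num_freqs=8 and
# include_input=True the embedding width is 3 * (8 * 2 + 1) = 51. The computation below mirrors
# the forward pass above with hypothetical inputs.
def _fourier_embed_sketch():
    import torch
    num_freqs, x = 8, torch.rand(5, 3)                    # 5 points in 3D
    freqs = (2.0 ** torch.arange(num_freqs, dtype=torch.float32)) * torch.pi
    embed = (x[..., None] * freqs).view(*x.shape[:-1], -1)
    out = torch.cat((x, embed.sin(), embed.cos()), dim=-1)
    return out.shape                                      # torch.Size([5, 51])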
class CrossAttentionProcessor:
def __call__(self, attn, q, k, v):
out = F.scaled_dot_product_attention(q, k, v)
return out
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect implementation created for EfficientNet-style networks; however,
the original name is misleading because 'DropConnect' refers to a different form of dropout from a separate paper.
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956. The layer and argument
are therefore named 'drop path' here rather than mixing DropConnect as a layer name with 'survival rate' as the argument.
"""
if self.drop_prob == 0. or not self.training:
return x
keep_prob = 1 - self.drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
if keep_prob > 0.0 and self.scale_by_keep:
random_tensor.div_(keep_prob)
return x * random_tensor
def extra_repr(self):
return f'drop_prob={round(self.drop_prob, 3):0.3f}'
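# Illustrative sketch, not part of the original file: stochastic depth keeps each sample's residual
# branch with probability keep_prob and rescales the survivors so the expected value is unchanged.
def _drop_path_sketch(drop_prob=0.25):
    import torch
    x = torch.randn(4, 10)                                # (batch, features), hypothetical
    keep_prob = 1.0 - drop_prob
    mask = x.new_empty((x.shape[0], 1)).bernoulli_(keep_prob)
    return x * mask / keep_prob                           # zeroed or rescaled per sample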
class MLP(nn.Module):
def __init__(
self, *,
width: int,
expand_ratio: int = 4,
output_width: int = None,
drop_path_rate: float = 0.0
):
super().__init__()
self.width = width
self.c_fc = ops.Linear(width, width * expand_ratio)
self.c_proj = ops.Linear(width * expand_ratio, output_width if output_width is not None else width)
self.gelu = nn.GELU()
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(self, x):
return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
class QKVMultiheadCrossAttention(nn.Module):
def __init__(
self,
*,
heads: int,
width=None,
qk_norm=False,
norm_layer=ops.LayerNorm
):
super().__init__()
self.heads = heads
self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
self.attn_processor = CrossAttentionProcessor()
def forward(self, q, kv):
_, n_ctx, _ = q.shape
bs, n_data, width = kv.shape
attn_ch = width // self.heads // 2
q = q.view(bs, n_ctx, self.heads, -1)
kv = kv.view(bs, n_data, self.heads, -1)
k, v = torch.split(kv, attn_ch, dim=-1)
q = self.q_norm(q)
k = self.k_norm(k)
q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
out = self.attn_processor(self, q, k, v)
out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
return out
class MultiheadCrossAttention(nn.Module):
def __init__(
self,
*,
width: int,
heads: int,
qkv_bias: bool = True,
data_width: Optional[int] = None,
norm_layer=ops.LayerNorm,
qk_norm: bool = False,
kv_cache: bool = False,
):
super().__init__()
self.width = width
self.heads = heads
self.data_width = width if data_width is None else data_width
self.c_q = ops.Linear(width, width, bias=qkv_bias)
self.c_kv = ops.Linear(self.data_width, width * 2, bias=qkv_bias)
self.c_proj = ops.Linear(width, width)
self.attention = QKVMultiheadCrossAttention(
heads=heads,
width=width,
norm_layer=norm_layer,
qk_norm=qk_norm
)
self.kv_cache = kv_cache
self.data = None
def forward(self, x, data):
x = self.c_q(x)
if self.kv_cache:
if self.data is None:
self.data = self.c_kv(data)
logging.info('Saving kv cache, this should be called only once per mesh')
data = self.data
else:
data = self.c_kv(data)
x = self.attention(x, data)
x = self.c_proj(x)
return x
class ResidualCrossAttentionBlock(nn.Module):
def __init__(
self,
*,
width: int,
heads: int,
mlp_expand_ratio: int = 4,
data_width: Optional[int] = None,
qkv_bias: bool = True,
norm_layer=ops.LayerNorm,
qk_norm: bool = False
):
super().__init__()
if data_width is None:
data_width = width
self.attn = MultiheadCrossAttention(
width=width,
heads=heads,
data_width=data_width,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
qk_norm=qk_norm
)
self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio)
def forward(self, x: torch.Tensor, data: torch.Tensor):
x = x + self.attn(self.ln_1(x), self.ln_2(data))
x = x + self.mlp(self.ln_3(x))
return x
class QKVMultiheadAttention(nn.Module):
def __init__(
self,
*,
heads: int,
width=None,
qk_norm=False,
norm_layer=ops.LayerNorm
):
super().__init__()
self.heads = heads
self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
def forward(self, qkv):
bs, n_ctx, width = qkv.shape
attn_ch = width // self.heads // 3
qkv = qkv.view(bs, n_ctx, self.heads, -1)
q, k, v = torch.split(qkv, attn_ch, dim=-1)
q = self.q_norm(q)
k = self.k_norm(k)
q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
return out
class MultiheadAttention(nn.Module):
def __init__(
self,
*,
width: int,
heads: int,
qkv_bias: bool,
norm_layer=ops.LayerNorm,
qk_norm: bool = False,
drop_path_rate: float = 0.0
):
super().__init__()
self.width = width
self.heads = heads
self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
self.c_proj = ops.Linear(width, width)
self.attention = QKVMultiheadAttention(
heads=heads,
width=width,
norm_layer=norm_layer,
qk_norm=qk_norm
)
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(self, x):
x = self.c_qkv(x)
x = self.attention(x)
x = self.drop_path(self.c_proj(x))
return x
class ResidualAttentionBlock(nn.Module):
def __init__(
self,
*,
width: int,
heads: int,
qkv_bias: bool = True,
norm_layer=ops.LayerNorm,
qk_norm: bool = False,
drop_path_rate: float = 0.0,
):
super().__init__()
self.attn = MultiheadAttention(
width=width,
heads=heads,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
qk_norm=qk_norm,
drop_path_rate=drop_path_rate
)
self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
def forward(self, x: torch.Tensor):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
class Transformer(nn.Module):
def __init__(
self,
*,
width: int,
layers: int,
heads: int,
qkv_bias: bool = True,
norm_layer=ops.LayerNorm,
qk_norm: bool = False,
drop_path_rate: float = 0.0
):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.ModuleList(
[
ResidualAttentionBlock(
width=width,
heads=heads,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
qk_norm=qk_norm,
drop_path_rate=drop_path_rate
)
for _ in range(layers)
]
)
def forward(self, x: torch.Tensor):
for block in self.resblocks:
x = block(x)
return x
class CrossAttentionDecoder(nn.Module):
def __init__(
self,
*,
out_channels: int,
fourier_embedder: FourierEmbedder,
width: int,
heads: int,
mlp_expand_ratio: int = 4,
downsample_ratio: int = 1,
enable_ln_post: bool = True,
qkv_bias: bool = True,
qk_norm: bool = False,
label_type: str = "binary"
):
super().__init__()
self.enable_ln_post = enable_ln_post
self.fourier_embedder = fourier_embedder
self.downsample_ratio = downsample_ratio
self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
if self.downsample_ratio != 1:
self.latents_proj = ops.Linear(width * downsample_ratio, width)
if not self.enable_ln_post:
qk_norm = False
self.cross_attn_decoder = ResidualCrossAttentionBlock(
width=width,
mlp_expand_ratio=mlp_expand_ratio,
heads=heads,
qkv_bias=qkv_bias,
qk_norm=qk_norm
)
if self.enable_ln_post:
self.ln_post = ops.LayerNorm(width)
self.output_proj = ops.Linear(width, out_channels)
self.label_type = label_type
self.count = 0
def forward(self, queries=None, query_embeddings=None, latents=None):
if query_embeddings is None:
query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
self.count += query_embeddings.shape[1]
if self.downsample_ratio != 1:
latents = self.latents_proj(latents)
x = self.cross_attn_decoder(query_embeddings, latents)
if self.enable_ln_post:
x = self.ln_post(x)
occ = self.output_proj(x)
return occ
class ShapeVAE(nn.Module):
def __init__(
self,
*,
embed_dim: int,
width: int,
heads: int,
num_decoder_layers: int,
geo_decoder_downsample_ratio: int = 1,
geo_decoder_mlp_expand_ratio: int = 4,
geo_decoder_ln_post: bool = True,
num_freqs: int = 8,
include_pi: bool = True,
qkv_bias: bool = True,
qk_norm: bool = False,
label_type: str = "binary",
drop_path_rate: float = 0.0,
scale_factor: float = 1.0,
):
super().__init__()
self.geo_decoder_ln_post = geo_decoder_ln_post
self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
self.post_kl = ops.Linear(embed_dim, width)
self.transformer = Transformer(
width=width,
layers=num_decoder_layers,
heads=heads,
qkv_bias=qkv_bias,
qk_norm=qk_norm,
drop_path_rate=drop_path_rate
)
self.geo_decoder = CrossAttentionDecoder(
fourier_embedder=self.fourier_embedder,
out_channels=1,
mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
downsample_ratio=geo_decoder_downsample_ratio,
enable_ln_post=self.geo_decoder_ln_post,
width=width // geo_decoder_downsample_ratio,
heads=heads // geo_decoder_downsample_ratio,
qkv_bias=qkv_bias,
qk_norm=qk_norm,
label_type=label_type,
)
self.volume_decoder = VanillaVolumeDecoder()
self.scale_factor = scale_factor
def decode(self, latents, **kwargs):
latents = self.post_kl(latents.movedim(-2, -1))
latents = self.transformer(latents)
bounds = kwargs.get("bounds", 1.01)
num_chunks = kwargs.get("num_chunks", 8000)
octree_resolution = kwargs.get("octree_resolution", 256)
enable_pbar = kwargs.get("enable_pbar", True)
grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
return grid_logits.movedim(-2, -1)
def encode(self, x):
return None

View File

@@ -227,8 +227,6 @@ class HunyuanVideo(nn.Module):
timesteps: Tensor, timesteps: Tensor,
y: Tensor, y: Tensor,
guidance: Tensor = None, guidance: Tensor = None,
guiding_frame_index=None,
ref_latent=None,
control=None, control=None,
transformer_options={}, transformer_options={},
) -> Tensor: ) -> Tensor:
@@ -239,25 +237,7 @@ class HunyuanVideo(nn.Module):
img = self.img_in(img) img = self.img_in(img)
vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype)) vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
if ref_latent is not None: vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
ref_latent_ids = self.img_ids(ref_latent)
ref_latent = self.img_in(ref_latent)
img = torch.cat([ref_latent, img], dim=-2)
ref_latent_ids[..., 0] = -1
ref_latent_ids[..., 2] += (initial_shape[-1] // self.patch_size[-1])
img_ids = torch.cat([ref_latent_ids, img_ids], dim=-2)
if guiding_frame_index is not None:
token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
modulation_dims_txt = [(0, None, 1)]
else:
vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
modulation_dims = None
modulation_dims_txt = None
if self.params.guidance_embed: if self.params.guidance_embed:
if guidance is not None: if guidance is not None:
@@ -284,14 +264,14 @@ class HunyuanVideo(nn.Module):
if ("double_block", i) in blocks_replace: if ("double_block", i) in blocks_replace:
def block_wrap(args): def block_wrap(args):
out = {} out = {}
out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"]) out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
return out return out
out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap}) out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
txt = out["txt"] txt = out["txt"]
img = out["img"] img = out["img"]
else: else:
img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask)
if control is not None: # Controlnet if control is not None: # Controlnet
control_i = control.get("input") control_i = control.get("input")
@@ -306,13 +286,13 @@ class HunyuanVideo(nn.Module):
if ("single_block", i) in blocks_replace: if ("single_block", i) in blocks_replace:
def block_wrap(args): def block_wrap(args):
out = {} out = {}
out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"]) out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
return out return out
out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap}) out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
img = out["img"] img = out["img"]
else: else:
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims) img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
if control is not None: # Controlnet if control is not None: # Controlnet
control_o = control.get("output") control_o = control.get("output")
@@ -322,10 +302,8 @@ class HunyuanVideo(nn.Module):
img[:, : img_len] += add img[:, : img_len] += add
img = img[:, : img_len] img = img[:, : img_len]
if ref_latent is not None:
img = img[:, ref_latent.shape[1]:]
img = self.final_layer(img, vec, modulation_dims=modulation_dims) # (N, T, patch_size ** 2 * out_channels) img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
shape = initial_shape[-3:] shape = initial_shape[-3:]
for i in range(len(shape)): for i in range(len(shape)):
@@ -335,7 +313,7 @@ class HunyuanVideo(nn.Module):
img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4]) img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
return img return img
def img_ids(self, x): def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, control=None, transformer_options={}, **kwargs):
bs, c, t, h, w = x.shape bs, c, t, h, w = x.shape
patch_size = self.patch_size patch_size = self.patch_size
t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
@@ -345,11 +323,7 @@ class HunyuanVideo(nn.Module):
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1) img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
return repeat(img_ids, "t h w c -> b (t h w) c", b=bs) img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
bs, c, t, h, w = x.shape
img_ids = self.img_ids(x)
txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options) out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, control, transformer_options)
return out return out

View File

@@ -3,7 +3,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import comfy.ops import comfy.ops
from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from torch.utils import checkpoint from torch.utils import checkpoint
@@ -51,7 +51,7 @@ class HunYuanDiTBlock(nn.Module):
if norm_type == "layer": if norm_type == "layer":
norm_layer = operations.LayerNorm norm_layer = operations.LayerNorm
elif norm_type == "rms": elif norm_type == "rms":
norm_layer = operations.RMSNorm norm_layer = RMSNorm
else: else:
raise ValueError(f"Unknown norm_type: {norm_type}") raise ValueError(f"Unknown norm_type: {norm_type}")

View File

@@ -1,12 +1,13 @@
import torch import torch
from torch import nn from torch import nn
import comfy.ldm.modules.attention import comfy.ldm.modules.attention
from comfy.ldm.genmo.joint_model.layers import RMSNorm
import comfy.ldm.common_dit import comfy.ldm.common_dit
from einops import rearrange from einops import rearrange
import math import math
from typing import Dict, Optional, Tuple from typing import Dict, Optional, Tuple
from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords from .symmetric_patchifier import SymmetricPatchifier
def get_timestep_embedding( def get_timestep_embedding(
@@ -261,8 +262,8 @@ class CrossAttention(nn.Module):
self.heads = heads self.heads = heads
self.dim_head = dim_head self.dim_head = dim_head
self.q_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device) self.q_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
self.k_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device) self.k_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device) self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device)
self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device) self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
@@ -376,16 +377,12 @@ class LTXVModel(torch.nn.Module):
positional_embedding_theta=10000.0, positional_embedding_theta=10000.0,
positional_embedding_max_pos=[20, 2048, 2048], positional_embedding_max_pos=[20, 2048, 2048],
causal_temporal_positioning=False,
vae_scale_factors=(8, 32, 32),
dtype=None, device=None, operations=None, **kwargs): dtype=None, device=None, operations=None, **kwargs):
super().__init__() super().__init__()
self.generator = None self.generator = None
self.vae_scale_factors = vae_scale_factors
self.dtype = dtype self.dtype = dtype
self.out_channels = in_channels self.out_channels = in_channels
self.inner_dim = num_attention_heads * attention_head_dim self.inner_dim = num_attention_heads * attention_head_dim
self.causal_temporal_positioning = causal_temporal_positioning
self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device) self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
@@ -419,23 +416,42 @@ class LTXVModel(torch.nn.Module):
self.patchifier = SymmetricPatchifier(1) self.patchifier = SymmetricPatchifier(1)
def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs): def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
patches_replace = transformer_options.get("patches_replace", {}) patches_replace = transformer_options.get("patches_replace", {})
indices_grid = self.patchifier.get_grid(
orig_num_frames=x.shape[2],
orig_height=x.shape[3],
orig_width=x.shape[4],
batch_size=x.shape[0],
scale_grid=((1 / frame_rate) * 8, 32, 32),
device=x.device,
)
if guiding_latent is not None:
ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
ts *= input_ts
ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
timestep = self.patchifier.patchify(ts)
input_x = x.clone()
x[:, :, 0] = guiding_latent[:, :, 0]
if guiding_latent_noise_scale > 0:
if self.generator is None:
self.generator = torch.Generator(device=x.device).manual_seed(42)
elif self.generator.device != x.device:
self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
scale = guiding_latent_noise_scale * (input_ts ** 2)
guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] * (1.0 - scale[:, :, 0])
orig_shape = list(x.shape) orig_shape = list(x.shape)
x, latent_coords = self.patchifier.patchify(x) x = self.patchifier.patchify(x)
pixel_coords = latent_to_pixel_coords(
latent_coords=latent_coords,
scale_factors=self.vae_scale_factors,
causal_fix=self.causal_temporal_positioning,
)
if keyframe_idxs is not None:
pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
fractional_coords = pixel_coords.to(torch.float32)
fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
x = self.patchify_proj(x) x = self.patchify_proj(x)
timestep = timestep * 1000.0 timestep = timestep * 1000.0
@@ -443,7 +459,7 @@ class LTXVModel(torch.nn.Module):
if attention_mask is not None and not torch.is_floating_point(attention_mask): if attention_mask is not None and not torch.is_floating_point(attention_mask):
attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype) pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
batch_size = x.shape[0] batch_size = x.shape[0]
timestep, embedded_timestep = self.adaln_single( timestep, embedded_timestep = self.adaln_single(
@@ -503,4 +519,8 @@ class LTXVModel(torch.nn.Module):
out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size), out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
) )
if guiding_latent is not None:
x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
# print("res", x)
return x return x

View File

@@ -6,29 +6,16 @@ from einops import rearrange
from torch import Tensor from torch import Tensor
def latent_to_pixel_coords( def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
) -> Tensor: dims_to_append = target_dims - x.ndim
""" if dims_to_append < 0:
Converts latent coordinates to pixel coordinates by scaling them according to the VAE's raise ValueError(
configuration. f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
Args: )
latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents] elif dims_to_append == 0:
containing the latent corner coordinates of each token. return x
scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space. return x[(...,) + (None,) * dims_to_append]
causal_fix (bool): Whether to take into account the different temporal scale
of the first frame. Default = False for backwards compatibility.
Returns:
Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
"""
pixel_coords = (
latent_coords
* torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
)
if causal_fix:
# Fix temporal scale for first frame to 1 due to causality
pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
return pixel_coords
class Patchifier(ABC): class Patchifier(ABC):
@@ -57,26 +44,29 @@ class Patchifier(ABC):
def patch_size(self): def patch_size(self):
return self._patch_size return self._patch_size
def get_latent_coords( def get_grid(
self, latent_num_frames, latent_height, latent_width, batch_size, device self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
): ):
""" f = orig_num_frames // self._patch_size[0]
Return a tensor of shape [batch_size, 3, num_patches] containing the h = orig_height // self._patch_size[1]
top-left corner latent coordinates of each latent patch. w = orig_width // self._patch_size[2]
The tensor is repeated for each batch element. grid_h = torch.arange(h, dtype=torch.float32, device=device)
""" grid_w = torch.arange(w, dtype=torch.float32, device=device)
latent_sample_coords = torch.meshgrid( grid_f = torch.arange(f, dtype=torch.float32, device=device)
torch.arange(0, latent_num_frames, self._patch_size[0], device=device), grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')
torch.arange(0, latent_height, self._patch_size[1], device=device), grid = torch.stack(grid, dim=0)
torch.arange(0, latent_width, self._patch_size[2], device=device), grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
indexing="ij",
) if scale_grid is not None:
latent_sample_coords = torch.stack(latent_sample_coords, dim=0) for i in range(3):
latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1) if isinstance(scale_grid[i], Tensor):
latent_coords = rearrange( scale = append_dims(scale_grid[i], grid.ndim - 1)
latent_coords, "b c f h w -> b c (f h w)", b=batch_size else:
) scale = scale_grid[i]
return latent_coords grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
return grid
class SymmetricPatchifier(Patchifier): class SymmetricPatchifier(Patchifier):
@@ -84,8 +74,6 @@ class SymmetricPatchifier(Patchifier):
self, self,
latents: Tensor, latents: Tensor,
) -> Tuple[Tensor, Tensor]: ) -> Tuple[Tensor, Tensor]:
b, _, f, h, w = latents.shape
latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
latents = rearrange( latents = rearrange(
latents, latents,
"b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)", "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -93,7 +81,7 @@ class SymmetricPatchifier(Patchifier):
p2=self._patch_size[1], p2=self._patch_size[1],
p3=self._patch_size[2], p3=self._patch_size[2],
) )
return latents, latent_coords return latents
def unpatchify( def unpatchify(
self, self,

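A self-contained sketch tying the two coordinate helpers above together (patch sizes and VAE scale factors are illustrative assumptions, not the LTX defaults): latent patch coordinates come from a meshgrid over the patch grid and are scaled into pixel space, with the optional causal fix for the first frame.

```python
import torch

def patch_coords(latent_frames, latent_h, latent_w, patch_size=(1, 1, 1)):
    # top-left latent coordinate of every (frame, row, col) patch -> [1, 3, num_patches]
    coords = torch.meshgrid(
        torch.arange(0, latent_frames, patch_size[0]),
        torch.arange(0, latent_h, patch_size[1]),
        torch.arange(0, latent_w, patch_size[2]),
        indexing="ij",
    )
    return torch.stack(coords, dim=0).reshape(3, -1).unsqueeze(0)

def to_pixel_coords(latent_coords, scale_factors=(8, 32, 32), causal_fix=False):
    pixel = latent_coords * torch.tensor(scale_factors)[None, :, None]
    if causal_fix:  # the first frame of a causal VAE covers a single pixel frame
        pixel[:, 0] = (pixel[:, 0] + 1 - scale_factors[0]).clamp(min=0)
    return pixel

coords = patch_coords(3, 4, 4)
print(to_pixel_coords(coords, causal_fix=True).shape)  # torch.Size([1, 3, 48])
```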
View File

@@ -15,7 +15,6 @@ class CausalConv3d(nn.Module):
stride: Union[int, Tuple[int]] = 1, stride: Union[int, Tuple[int]] = 1,
dilation: int = 1, dilation: int = 1,
groups: int = 1, groups: int = 1,
spatial_padding_mode: str = "zeros",
**kwargs, **kwargs,
): ):
super().__init__() super().__init__()
@@ -39,7 +38,7 @@ class CausalConv3d(nn.Module):
stride=stride, stride=stride,
dilation=dilation, dilation=dilation,
padding=padding, padding=padding,
padding_mode=spatial_padding_mode, padding_mode="zeros",
groups=groups, groups=groups,
) )

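The hunk above removes the configurable spatial padding mode from CausalConv3d. For reference, a minimal toy version of the causal idea itself (not the ComfyUI class): all temporal padding is placed on the past side by replicating the first frame, so no output frame sees future frames.

```python
import torch
import torch.nn as nn

class TinyCausalConv3d(nn.Module):
    """Illustrative only: a 3D conv made causal in time by left-padding with the first frame."""
    def __init__(self, ch, kernel_size=3):
        super().__init__()
        self.time_pad = kernel_size - 1  # all temporal padding goes to the past
        self.conv = nn.Conv3d(ch, ch, kernel_size,
                              padding=(0, kernel_size // 2, kernel_size // 2))

    def forward(self, x):  # x: [b, c, t, h, w]
        first = x[:, :, :1].repeat(1, 1, self.time_pad, 1, 1)
        return self.conv(torch.cat([first, x], dim=2))

x = torch.randn(1, 8, 5, 16, 16)
print(TinyCausalConv3d(8)(x).shape)  # torch.Size([1, 8, 5, 16, 16])
```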
View File

@@ -1,15 +1,13 @@
from __future__ import annotations
import torch import torch
from torch import nn from torch import nn
from functools import partial from functools import partial
import math import math
from einops import rearrange from einops import rearrange
from typing import List, Optional, Tuple, Union from typing import Optional, Tuple, Union
from .conv_nd_factory import make_conv_nd, make_linear_nd from .conv_nd_factory import make_conv_nd, make_linear_nd
from .pixel_norm import PixelNorm from .pixel_norm import PixelNorm
from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
import comfy.ops import comfy.ops
ops = comfy.ops.disable_weight_init ops = comfy.ops.disable_weight_init
class Encoder(nn.Module): class Encoder(nn.Module):
@@ -34,7 +32,7 @@ class Encoder(nn.Module):
norm_layer (`str`, *optional*, defaults to `group_norm`): norm_layer (`str`, *optional*, defaults to `group_norm`):
The normalization layer to use. Can be either `group_norm` or `pixel_norm`. The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
latent_log_var (`str`, *optional*, defaults to `per_channel`): latent_log_var (`str`, *optional*, defaults to `per_channel`):
The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`. The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
""" """
def __init__( def __init__(
@@ -42,13 +40,12 @@ class Encoder(nn.Module):
dims: Union[int, Tuple[int, int]] = 3, dims: Union[int, Tuple[int, int]] = 3,
in_channels: int = 3, in_channels: int = 3,
out_channels: int = 3, out_channels: int = 3,
blocks: List[Tuple[str, int | dict]] = [("res_x", 1)], blocks=[("res_x", 1)],
base_channels: int = 128, base_channels: int = 128,
norm_num_groups: int = 32, norm_num_groups: int = 32,
patch_size: Union[int, Tuple[int]] = 1, patch_size: Union[int, Tuple[int]] = 1,
norm_layer: str = "group_norm", # group_norm, pixel_norm norm_layer: str = "group_norm", # group_norm, pixel_norm
latent_log_var: str = "per_channel", latent_log_var: str = "per_channel",
spatial_padding_mode: str = "zeros",
): ):
super().__init__() super().__init__()
self.patch_size = patch_size self.patch_size = patch_size
@@ -68,7 +65,6 @@ class Encoder(nn.Module):
stride=1, stride=1,
padding=1, padding=1,
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
self.down_blocks = nn.ModuleList([]) self.down_blocks = nn.ModuleList([])
@@ -86,7 +82,6 @@ class Encoder(nn.Module):
resnet_eps=1e-6, resnet_eps=1e-6,
resnet_groups=norm_num_groups, resnet_groups=norm_num_groups,
norm_layer=norm_layer, norm_layer=norm_layer,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "res_x_y": elif block_name == "res_x_y":
output_channel = block_params.get("multiplier", 2) * output_channel output_channel = block_params.get("multiplier", 2) * output_channel
@@ -97,7 +92,6 @@ class Encoder(nn.Module):
eps=1e-6, eps=1e-6,
groups=norm_num_groups, groups=norm_num_groups,
norm_layer=norm_layer, norm_layer=norm_layer,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_time": elif block_name == "compress_time":
block = make_conv_nd( block = make_conv_nd(
@@ -107,7 +101,6 @@ class Encoder(nn.Module):
kernel_size=3, kernel_size=3,
stride=(2, 1, 1), stride=(2, 1, 1),
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_space": elif block_name == "compress_space":
block = make_conv_nd( block = make_conv_nd(
@@ -117,7 +110,6 @@ class Encoder(nn.Module):
kernel_size=3, kernel_size=3,
stride=(1, 2, 2), stride=(1, 2, 2),
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_all": elif block_name == "compress_all":
block = make_conv_nd( block = make_conv_nd(
@@ -127,7 +119,6 @@ class Encoder(nn.Module):
kernel_size=3, kernel_size=3,
stride=(2, 2, 2), stride=(2, 2, 2),
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_all_x_y": elif block_name == "compress_all_x_y":
output_channel = block_params.get("multiplier", 2) * output_channel output_channel = block_params.get("multiplier", 2) * output_channel
@@ -138,34 +129,6 @@ class Encoder(nn.Module):
kernel_size=3, kernel_size=3,
stride=(2, 2, 2), stride=(2, 2, 2),
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_all_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(2, 2, 2),
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_space_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(1, 2, 2),
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_time_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(2, 1, 1),
spatial_padding_mode=spatial_padding_mode,
) )
else: else:
raise ValueError(f"unknown block: {block_name}") raise ValueError(f"unknown block: {block_name}")
@@ -189,18 +152,10 @@ class Encoder(nn.Module):
conv_out_channels *= 2 conv_out_channels *= 2
elif latent_log_var == "uniform": elif latent_log_var == "uniform":
conv_out_channels += 1 conv_out_channels += 1
elif latent_log_var == "constant":
conv_out_channels += 1
elif latent_log_var != "none": elif latent_log_var != "none":
raise ValueError(f"Invalid latent_log_var: {latent_log_var}") raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
self.conv_out = make_conv_nd( self.conv_out = make_conv_nd(
dims, dims, output_channel, conv_out_channels, 3, padding=1, causal=True
output_channel,
conv_out_channels,
3,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
self.gradient_checkpointing = False self.gradient_checkpointing = False
@@ -242,15 +197,6 @@ class Encoder(nn.Module):
sample = torch.cat([sample, repeated_last_channel], dim=1) sample = torch.cat([sample, repeated_last_channel], dim=1)
else: else:
raise ValueError(f"Invalid input shape: {sample.shape}") raise ValueError(f"Invalid input shape: {sample.shape}")
elif self.latent_log_var == "constant":
sample = sample[:, :-1, ...]
approx_ln_0 = (
-30
) # this is the minimal clamp value in DiagonalGaussianDistribution objects
sample = torch.cat(
[sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
dim=1,
)
return sample return sample
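The deleted "constant" branch fakes a near-deterministic posterior by appending a log-variance channel filled with the minimum clamp value (-30) used by DiagonalGaussianDistribution. A toy sketch of the three log-variance layouts (channel counts are made up, not the encoder's real widths):

```python
import torch

def add_logvar(sample, latent_log_var):
    if latent_log_var == "per_channel":
        return sample                      # conv already produced 2x channels: [mean, logvar]
    if latent_log_var == "uniform":
        last = sample[:, -1:, ...]         # one shared logvar channel, repeated per mean channel
        num_channels = sample.size(1) - 1
        return torch.cat([sample[:, :-1], last.repeat(1, num_channels, 1, 1, 1)], dim=1)
    if latent_log_var == "constant":
        sample = sample[:, :-1, ...]       # drop the spare channel, append clamped logvar
        return torch.cat([sample, torch.ones_like(sample) * -30.0], dim=1)
    return sample                          # "none"

x = torch.randn(1, 9, 2, 4, 4)             # 8 mean channels + 1 extra channel
print(add_logvar(x, "constant").shape)      # torch.Size([1, 16, 2, 4, 4])
```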
@@ -285,7 +231,7 @@ class Decoder(nn.Module):
dims, dims,
in_channels: int = 3, in_channels: int = 3,
out_channels: int = 3, out_channels: int = 3,
blocks: List[Tuple[str, int | dict]] = [("res_x", 1)], blocks=[("res_x", 1)],
base_channels: int = 128, base_channels: int = 128,
layers_per_block: int = 2, layers_per_block: int = 2,
norm_num_groups: int = 32, norm_num_groups: int = 32,
@@ -293,7 +239,6 @@ class Decoder(nn.Module):
norm_layer: str = "group_norm", norm_layer: str = "group_norm",
causal: bool = True, causal: bool = True,
timestep_conditioning: bool = False, timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
): ):
super().__init__() super().__init__()
self.patch_size = patch_size self.patch_size = patch_size
@@ -319,7 +264,6 @@ class Decoder(nn.Module):
stride=1, stride=1,
padding=1, padding=1,
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
self.up_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([])
@@ -339,7 +283,6 @@ class Decoder(nn.Module):
norm_layer=norm_layer, norm_layer=norm_layer,
inject_noise=block_params.get("inject_noise", False), inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=timestep_conditioning, timestep_conditioning=timestep_conditioning,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "attn_res_x": elif block_name == "attn_res_x":
block = UNetMidBlock3D( block = UNetMidBlock3D(
@@ -351,7 +294,6 @@ class Decoder(nn.Module):
inject_noise=block_params.get("inject_noise", False), inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=timestep_conditioning, timestep_conditioning=timestep_conditioning,
attention_head_dim=block_params["attention_head_dim"], attention_head_dim=block_params["attention_head_dim"],
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "res_x_y": elif block_name == "res_x_y":
output_channel = output_channel // block_params.get("multiplier", 2) output_channel = output_channel // block_params.get("multiplier", 2)
@@ -364,21 +306,14 @@ class Decoder(nn.Module):
norm_layer=norm_layer, norm_layer=norm_layer,
inject_noise=block_params.get("inject_noise", False), inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=False, timestep_conditioning=False,
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_time": elif block_name == "compress_time":
block = DepthToSpaceUpsample( block = DepthToSpaceUpsample(
dims=dims, dims=dims, in_channels=input_channel, stride=(2, 1, 1)
in_channels=input_channel,
stride=(2, 1, 1),
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_space": elif block_name == "compress_space":
block = DepthToSpaceUpsample( block = DepthToSpaceUpsample(
dims=dims, dims=dims, in_channels=input_channel, stride=(1, 2, 2)
in_channels=input_channel,
stride=(1, 2, 2),
spatial_padding_mode=spatial_padding_mode,
) )
elif block_name == "compress_all": elif block_name == "compress_all":
output_channel = output_channel // block_params.get("multiplier", 1) output_channel = output_channel // block_params.get("multiplier", 1)
@@ -388,7 +323,6 @@ class Decoder(nn.Module):
stride=(2, 2, 2), stride=(2, 2, 2),
residual=block_params.get("residual", False), residual=block_params.get("residual", False),
out_channels_reduction_factor=block_params.get("multiplier", 1), out_channels_reduction_factor=block_params.get("multiplier", 1),
spatial_padding_mode=spatial_padding_mode,
) )
else: else:
raise ValueError(f"unknown layer: {block_name}") raise ValueError(f"unknown layer: {block_name}")
@@ -406,13 +340,7 @@ class Decoder(nn.Module):
self.conv_act = nn.SiLU() self.conv_act = nn.SiLU()
self.conv_out = make_conv_nd( self.conv_out = make_conv_nd(
dims, dims, output_channel, out_channels, 3, padding=1, causal=True
output_channel,
out_channels,
3,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
self.gradient_checkpointing = False self.gradient_checkpointing = False
@@ -505,12 +433,6 @@ class UNetMidBlock3D(nn.Module):
resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks. resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
resnet_groups (`int`, *optional*, defaults to 32): resnet_groups (`int`, *optional*, defaults to 32):
The number of groups to use in the group normalization layers of the resnet blocks. The number of groups to use in the group normalization layers of the resnet blocks.
norm_layer (`str`, *optional*, defaults to `group_norm`):
The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
inject_noise (`bool`, *optional*, defaults to `False`):
Whether to inject noise into the hidden states.
timestep_conditioning (`bool`, *optional*, defaults to `False`):
Whether to condition the hidden states on the timestep.
Returns: Returns:
`torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -529,7 +451,6 @@ class UNetMidBlock3D(nn.Module):
norm_layer: str = "group_norm", norm_layer: str = "group_norm",
inject_noise: bool = False, inject_noise: bool = False,
timestep_conditioning: bool = False, timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
): ):
super().__init__() super().__init__()
resnet_groups = ( resnet_groups = (
@@ -555,17 +476,13 @@ class UNetMidBlock3D(nn.Module):
norm_layer=norm_layer, norm_layer=norm_layer,
inject_noise=inject_noise, inject_noise=inject_noise,
timestep_conditioning=timestep_conditioning, timestep_conditioning=timestep_conditioning,
spatial_padding_mode=spatial_padding_mode,
) )
for _ in range(num_layers) for _ in range(num_layers)
] ]
) )
def forward( def forward(
self, self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
hidden_states: torch.FloatTensor,
causal: bool = True,
timestep: Optional[torch.Tensor] = None,
) -> torch.FloatTensor: ) -> torch.FloatTensor:
timestep_embed = None timestep_embed = None
if self.timestep_conditioning: if self.timestep_conditioning:
@@ -590,62 +507,9 @@ class UNetMidBlock3D(nn.Module):
return hidden_states return hidden_states
class SpaceToDepthDownsample(nn.Module):
def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
super().__init__()
self.stride = stride
self.group_size = in_channels * math.prod(stride) // out_channels
self.conv = make_conv_nd(
dims=dims,
in_channels=in_channels,
out_channels=out_channels // math.prod(stride),
kernel_size=3,
stride=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
def forward(self, x, causal: bool = True):
if self.stride[0] == 2:
x = torch.cat(
[x[:, :, :1, :, :], x], dim=2
) # duplicate first frames for padding
# skip connection
x_in = rearrange(
x,
"b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
p1=self.stride[0],
p2=self.stride[1],
p3=self.stride[2],
)
x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
x_in = x_in.mean(dim=2)
# conv
x = self.conv(x, causal=causal)
x = rearrange(
x,
"b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
p1=self.stride[0],
p2=self.stride[1],
p3=self.stride[2],
)
x = x + x_in
return x
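A standalone illustration of the space-to-depth trick the removed block relies on (toy tensor sizes): folding a 2x2x2 neighbourhood into the channel axis halves every downsampled axis without discarding information, and a grouped mean over the folded positions gives the low-resolution skip that is added back after the convolution.

```python
import torch
from einops import rearrange

x = torch.randn(1, 4, 8, 16, 16)  # [b, c, d, h, w]
folded = rearrange(x, "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w", p1=2, p2=2, p3=2)
print(folded.shape)               # torch.Size([1, 32, 4, 8, 8])

# grouped mean over the 8 folded positions -> low-resolution residual/skip
skip = rearrange(folded, "b (c g) d h w -> b c g d h w", g=8).mean(dim=2)
print(skip.shape)                 # torch.Size([1, 4, 4, 8, 8])
```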
class DepthToSpaceUpsample(nn.Module): class DepthToSpaceUpsample(nn.Module):
def __init__( def __init__(
self, self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
dims,
in_channels,
stride,
residual=False,
out_channels_reduction_factor=1,
spatial_padding_mode="zeros",
): ):
super().__init__() super().__init__()
self.stride = stride self.stride = stride
@@ -659,7 +523,6 @@ class DepthToSpaceUpsample(nn.Module):
kernel_size=3, kernel_size=3,
stride=1, stride=1,
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
self.residual = residual self.residual = residual
self.out_channels_reduction_factor = out_channels_reduction_factor self.out_channels_reduction_factor = out_channels_reduction_factor
@@ -695,7 +558,7 @@ class DepthToSpaceUpsample(nn.Module):
class LayerNorm(nn.Module): class LayerNorm(nn.Module):
def __init__(self, dim, eps, elementwise_affine=True) -> None: def __init__(self, dim, eps, elementwise_affine=True) -> None:
super().__init__() super().__init__()
self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine) self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
def forward(self, x): def forward(self, x):
x = rearrange(x, "b c d h w -> b d h w c") x = rearrange(x, "b c d h w -> b d h w c")
@@ -728,7 +591,6 @@ class ResnetBlock3D(nn.Module):
norm_layer: str = "group_norm", norm_layer: str = "group_norm",
inject_noise: bool = False, inject_noise: bool = False,
timestep_conditioning: bool = False, timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
): ):
super().__init__() super().__init__()
self.in_channels = in_channels self.in_channels = in_channels
@@ -755,7 +617,6 @@ class ResnetBlock3D(nn.Module):
stride=1, stride=1,
padding=1, padding=1,
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
if inject_noise: if inject_noise:
@@ -780,7 +641,6 @@ class ResnetBlock3D(nn.Module):
stride=1, stride=1,
padding=1, padding=1,
causal=True, causal=True,
spatial_padding_mode=spatial_padding_mode,
) )
if inject_noise: if inject_noise:
@@ -941,44 +801,9 @@ class processor(nn.Module):
return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x) return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)
class VideoVAE(nn.Module): class VideoVAE(nn.Module):
def __init__(self, version=0, config=None): def __init__(self, version=0):
super().__init__() super().__init__()
if config is None:
config = self.guess_config(version)
self.timestep_conditioning = config.get("timestep_conditioning", False)
double_z = config.get("double_z", True)
latent_log_var = config.get(
"latent_log_var", "per_channel" if double_z else "none"
)
self.encoder = Encoder(
dims=config["dims"],
in_channels=config.get("in_channels", 3),
out_channels=config["latent_channels"],
blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
latent_log_var=latent_log_var,
norm_layer=config.get("norm_layer", "group_norm"),
spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
)
self.decoder = Decoder(
dims=config["dims"],
in_channels=config["latent_channels"],
out_channels=config.get("out_channels", 3),
blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
norm_layer=config.get("norm_layer", "group_norm"),
causal=config.get("causal_decoder", False),
timestep_conditioning=self.timestep_conditioning,
spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
)
self.per_channel_statistics = processor()
def guess_config(self, version):
if version == 0: if version == 0:
config = { config = {
"_class_name": "CausalVideoAutoencoder", "_class_name": "CausalVideoAutoencoder",
@@ -1005,7 +830,7 @@ class VideoVAE(nn.Module):
"use_quant_conv": False, "use_quant_conv": False,
"causal_decoder": False, "causal_decoder": False,
} }
elif version == 1: else:
config = { config = {
"_class_name": "CausalVideoAutoencoder", "_class_name": "CausalVideoAutoencoder",
"dims": 3, "dims": 3,
@@ -1041,47 +866,37 @@ class VideoVAE(nn.Module):
"causal_decoder": False, "causal_decoder": False,
"timestep_conditioning": True, "timestep_conditioning": True,
} }
else:
config = { double_z = config.get("double_z", True)
"_class_name": "CausalVideoAutoencoder", latent_log_var = config.get(
"dims": 3, "latent_log_var", "per_channel" if double_z else "none"
"in_channels": 3, )
"out_channels": 3,
"latent_channels": 128, self.encoder = Encoder(
"encoder_blocks": [ dims=config["dims"],
["res_x", {"num_layers": 4}], in_channels=config.get("in_channels", 3),
["compress_space_res", {"multiplier": 2}], out_channels=config["latent_channels"],
["res_x", {"num_layers": 6}], blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
["compress_time_res", {"multiplier": 2}], patch_size=config.get("patch_size", 1),
["res_x", {"num_layers": 6}], latent_log_var=latent_log_var,
["compress_all_res", {"multiplier": 2}], norm_layer=config.get("norm_layer", "group_norm"),
["res_x", {"num_layers": 2}], )
["compress_all_res", {"multiplier": 2}],
["res_x", {"num_layers": 2}] self.decoder = Decoder(
], dims=config["dims"],
"decoder_blocks": [ in_channels=config["latent_channels"],
["res_x", {"num_layers": 5, "inject_noise": False}], out_channels=config.get("out_channels", 3),
["compress_all", {"residual": True, "multiplier": 2}], blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
["res_x", {"num_layers": 5, "inject_noise": False}], patch_size=config.get("patch_size", 1),
["compress_all", {"residual": True, "multiplier": 2}], norm_layer=config.get("norm_layer", "group_norm"),
["res_x", {"num_layers": 5, "inject_noise": False}], causal=config.get("causal_decoder", False),
["compress_all", {"residual": True, "multiplier": 2}], timestep_conditioning=config.get("timestep_conditioning", False),
["res_x", {"num_layers": 5, "inject_noise": False}] )
],
"scaling_factor": 1.0, self.timestep_conditioning = config.get("timestep_conditioning", False)
"norm_layer": "pixel_norm", self.per_channel_statistics = processor()
"patch_size": 4,
"latent_log_var": "uniform",
"use_quant_conv": False,
"causal_decoder": False,
"timestep_conditioning": True
}
return config
def encode(self, x): def encode(self, x):
frames_count = x.shape[2]
if ((frames_count - 1) % 8) != 0:
raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
means, logvar = torch.chunk(self.encoder(x), 2, dim=1) means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
return self.per_channel_statistics.normalize(means) return self.per_channel_statistics.normalize(means)
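A quick check mirroring the new validation above: with a temporal compression factor of 8, the causal VAE encodes one leading frame plus whole groups of eight, so only frame counts of the form 1 + 8*k are accepted.

```python
def valid_frame_count(frames, temporal_factor=8):
    return frames >= 1 and (frames - 1) % temporal_factor == 0

print([n for n in range(1, 30) if valid_frame_count(n)])  # [1, 9, 17, 25]
```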

View File

@@ -17,11 +17,7 @@ def make_conv_nd(
groups=1, groups=1,
bias=True, bias=True,
causal=False, causal=False,
spatial_padding_mode="zeros",
temporal_padding_mode="zeros",
): ):
if not (spatial_padding_mode == temporal_padding_mode or causal):
raise NotImplementedError("spatial and temporal padding modes must be equal")
if dims == 2: if dims == 2:
return ops.Conv2d( return ops.Conv2d(
in_channels=in_channels, in_channels=in_channels,
@@ -32,7 +28,6 @@ def make_conv_nd(
dilation=dilation, dilation=dilation,
groups=groups, groups=groups,
bias=bias, bias=bias,
padding_mode=spatial_padding_mode,
) )
elif dims == 3: elif dims == 3:
if causal: if causal:
@@ -45,7 +40,6 @@ def make_conv_nd(
dilation=dilation, dilation=dilation,
groups=groups, groups=groups,
bias=bias, bias=bias,
spatial_padding_mode=spatial_padding_mode,
) )
return ops.Conv3d( return ops.Conv3d(
in_channels=in_channels, in_channels=in_channels,
@@ -56,7 +50,6 @@ def make_conv_nd(
dilation=dilation, dilation=dilation,
groups=groups, groups=groups,
bias=bias, bias=bias,
padding_mode=spatial_padding_mode,
) )
elif dims == (2, 1): elif dims == (2, 1):
return DualConv3d( return DualConv3d(
@@ -66,7 +59,6 @@ def make_conv_nd(
stride=stride, stride=stride,
padding=padding, padding=padding,
bias=bias, bias=bias,
padding_mode=spatial_padding_mode,
) )
else: else:
raise ValueError(f"unsupported dimensions: {dims}") raise ValueError(f"unsupported dimensions: {dims}")

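For reference, a stripped-down sketch of the factory pattern in make_conv_nd (no causal or dual (2+1)D variants, and plain torch.nn layers instead of comfy.ops): the dims argument selects the convolution class so the encoder/decoder blocks stay dimension-agnostic.

```python
import torch
import torch.nn as nn

def make_conv(dims, in_ch, out_ch, kernel_size=3, **kwargs):
    if dims == 2:
        return nn.Conv2d(in_ch, out_ch, kernel_size, **kwargs)
    elif dims == 3:
        return nn.Conv3d(in_ch, out_ch, kernel_size, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")

print(make_conv(3, 4, 8, padding=1)(torch.randn(1, 4, 5, 16, 16)).shape)
# torch.Size([1, 8, 5, 16, 16])
```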
View File

@@ -18,13 +18,11 @@ class DualConv3d(nn.Module):
dilation: Union[int, Tuple[int, int, int]] = 1, dilation: Union[int, Tuple[int, int, int]] = 1,
groups=1, groups=1,
bias=True, bias=True,
padding_mode="zeros",
): ):
super(DualConv3d, self).__init__() super(DualConv3d, self).__init__()
self.in_channels = in_channels self.in_channels = in_channels
self.out_channels = out_channels self.out_channels = out_channels
self.padding_mode = padding_mode
# Ensure kernel_size, stride, padding, and dilation are tuples of length 3 # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
if isinstance(kernel_size, int): if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size, kernel_size) kernel_size = (kernel_size, kernel_size, kernel_size)
@@ -110,7 +108,6 @@ class DualConv3d(nn.Module):
self.padding1, self.padding1,
self.dilation1, self.dilation1,
self.groups, self.groups,
padding_mode=self.padding_mode,
) )
if skip_time_conv: if skip_time_conv:
@@ -125,7 +122,6 @@ class DualConv3d(nn.Module):
self.padding2, self.padding2,
self.dilation2, self.dilation2,
self.groups, self.groups,
padding_mode=self.padding_mode,
) )
return x return x
@@ -141,16 +137,7 @@ class DualConv3d(nn.Module):
stride1 = (self.stride1[1], self.stride1[2]) stride1 = (self.stride1[1], self.stride1[2])
padding1 = (self.padding1[1], self.padding1[2]) padding1 = (self.padding1[1], self.padding1[2])
dilation1 = (self.dilation1[1], self.dilation1[2]) dilation1 = (self.dilation1[1], self.dilation1[2])
x = F.conv2d( x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
x,
weight1,
self.bias1,
stride1,
padding1,
dilation1,
self.groups,
padding_mode=self.padding_mode,
)
_, _, h, w = x.shape _, _, h, w = x.shape
@@ -167,16 +154,7 @@ class DualConv3d(nn.Module):
stride2 = self.stride2[0] stride2 = self.stride2[0]
padding2 = self.padding2[0] padding2 = self.padding2[0]
dilation2 = self.dilation2[0] dilation2 = self.dilation2[0]
x = F.conv1d( x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
x,
weight2,
self.bias2,
stride2,
padding2,
dilation2,
self.groups,
padding_mode=self.padding_mode,
)
x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w) x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
return x return x

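A minimal sketch of the (2+1)D factorization DualConv3d implements in its 2D path (toy weights, not the module's parameters): a per-frame 3x3 spatial conv followed by a per-pixel length-3 temporal conv, glued together with the same rearrange patterns as above.

```python
import torch
import torch.nn.functional as F
from einops import rearrange

b, c, d, h, w = 1, 4, 6, 8, 8
x = torch.randn(b, c, d, h, w)
w_spatial = torch.randn(c, c, 3, 3)   # 2D spatial kernel
w_temporal = torch.randn(c, c, 3)     # 1D temporal kernel

x2 = rearrange(x, "b c d h w -> (b d) c h w")          # frames into the batch axis
x2 = F.conv2d(x2, w_spatial, padding=1)
x1 = rearrange(x2, "(b d) c h w -> (b h w) c d", b=b, d=d)  # pixels into the batch axis
x1 = F.conv1d(x1, w_temporal, padding=1)
out = rearrange(x1, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
print(out.shape)  # torch.Size([1, 4, 6, 8, 8])
```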
View File

@@ -8,7 +8,7 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import comfy.ldm.common_dit import comfy.ldm.common_dit
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, RMSNorm
from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.modules.attention import optimized_attention_masked
from comfy.ldm.flux.layers import EmbedND from comfy.ldm.flux.layers import EmbedND
@@ -64,8 +64,8 @@ class JointAttention(nn.Module):
) )
if qk_norm: if qk_norm:
self.q_norm = operation_settings.get("operations").RMSNorm(self.head_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.q_norm = RMSNorm(self.head_dim, elementwise_affine=True, **operation_settings)
self.k_norm = operation_settings.get("operations").RMSNorm(self.head_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.k_norm = RMSNorm(self.head_dim, elementwise_affine=True, **operation_settings)
else: else:
self.q_norm = self.k_norm = nn.Identity() self.q_norm = self.k_norm = nn.Identity()
@@ -242,11 +242,11 @@ class JointTransformerBlock(nn.Module):
operation_settings=operation_settings, operation_settings=operation_settings,
) )
self.layer_id = layer_id self.layer_id = layer_id
self.attention_norm1 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.attention_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
self.ffn_norm1 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
self.attention_norm2 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.attention_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
self.ffn_norm2 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
self.modulation = modulation self.modulation = modulation
if modulation: if modulation:
@@ -431,7 +431,7 @@ class NextDiT(nn.Module):
self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings) self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
self.cap_embedder = nn.Sequential( self.cap_embedder = nn.Sequential(
operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, **operation_settings),
operation_settings.get("operations").Linear( operation_settings.get("operations").Linear(
cap_feat_dim, cap_feat_dim,
dim, dim,
@@ -457,7 +457,7 @@ class NextDiT(nn.Module):
for layer_id in range(n_layers) for layer_id in range(n_layers)
] ]
) )
self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.norm_final = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings) self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)
assert (dim // n_heads) == sum(axes_dims) assert (dim // n_heads) == sum(axes_dims)

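The change above swaps a directly imported RMSNorm for one created through the operation_settings["operations"] namespace with explicit device and dtype, so the same model code can transparently use casting or offloading layer variants. A minimal sketch of that pattern, assuming a recent PyTorch that provides nn.RMSNorm (the SimpleOps container is a stand-in, not comfy.ops):

```python
import torch
import torch.nn as nn

class SimpleOps:                       # stand-in for an operations namespace
    RMSNorm = nn.RMSNorm
    Linear = nn.Linear

operation_settings = {"operations": SimpleOps, "device": "cpu", "dtype": torch.float32}

norm = operation_settings.get("operations").RMSNorm(
    64,
    eps=1e-6,
    elementwise_affine=True,
    device=operation_settings.get("device"),
    dtype=operation_settings.get("dtype"),
)
print(norm(torch.randn(2, 10, 64)).shape)  # torch.Size([2, 10, 64])
```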
View File

@@ -20,18 +20,8 @@ if model_management.xformers_enabled():
if model_management.sage_attention_enabled(): if model_management.sage_attention_enabled():
try: try:
from sageattention import sageattn from sageattention import sageattn
except ModuleNotFoundError as e:
if e.name == "sageattention":
logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
else:
raise e
exit(-1)
if model_management.flash_attention_enabled():
try:
from flash_attn import flash_attn_func
except ModuleNotFoundError: except ModuleNotFoundError:
logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn") logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
exit(-1) exit(-1)
from comfy.cli_args import args from comfy.cli_args import args
@@ -474,7 +464,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
if skip_reshape: if skip_reshape:
b, _, _, dim_head = q.shape b, _, _, dim_head = q.shape
tensor_layout = "HND" tensor_layout="HND"
else: else:
b, _, dim_head = q.shape b, _, dim_head = q.shape
dim_head //= heads dim_head //= heads
@@ -482,7 +472,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
lambda t: t.view(b, -1, heads, dim_head), lambda t: t.view(b, -1, heads, dim_head),
(q, k, v), (q, k, v),
) )
tensor_layout = "NHD" tensor_layout="NHD"
if mask is not None: if mask is not None:
# add a batch dimension if there isn't already one # add a batch dimension if there isn't already one
@@ -492,17 +482,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
if mask.ndim == 3: if mask.ndim == 3:
mask = mask.unsqueeze(1) mask = mask.unsqueeze(1)
try: out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
except Exception as e:
logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
if tensor_layout == "NHD":
q, k, v = map(
lambda t: t.transpose(1, 2),
(q, k, v),
)
return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape)
if tensor_layout == "HND": if tensor_layout == "HND":
if not skip_output_reshape: if not skip_output_reshape:
out = ( out = (
@@ -516,63 +496,6 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
return out return out
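The try/except added above is a graceful-degradation pattern: run the optional fast kernel, and if it raises, log the error and fall back to PyTorch's scaled_dot_product_attention. A simplified, self-contained version (not the ComfyUI function signature):

```python
import logging
import torch
import torch.nn.functional as F

def attention_with_fallback(q, k, v, fast_kernel=None, mask=None):
    if fast_kernel is not None:
        try:
            return fast_kernel(q, k, v, attn_mask=mask)
        except Exception as e:
            logging.error("fast attention failed (%s), using pytorch attention instead", e)
    return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

q = k = v = torch.randn(1, 8, 16, 64)   # [batch, heads, tokens, dim_head]
print(attention_with_fallback(q, k, v).shape)  # torch.Size([1, 8, 16, 64])
```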
try:
@torch.library.custom_op("flash_attention::flash_attn", mutates_args=())
def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
return flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)
@flash_attn_wrapper.register_fake
def flash_attn_fake(q, k, v, dropout_p=0.0, causal=False):
# Output shape is the same as q
return q.new_empty(q.shape)
except AttributeError as error:
FLASH_ATTN_ERROR = error
def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}"
def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
if skip_reshape:
b, _, _, dim_head = q.shape
else:
b, _, dim_head = q.shape
dim_head //= heads
q, k, v = map(
lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
(q, k, v),
)
if mask is not None:
# add a batch dimension if there isn't already one
if mask.ndim == 2:
mask = mask.unsqueeze(0)
# add a heads dimension if there isn't already one
if mask.ndim == 3:
mask = mask.unsqueeze(1)
try:
assert mask is None
out = flash_attn_wrapper(
q.transpose(1, 2),
k.transpose(1, 2),
v.transpose(1, 2),
dropout_p=0.0,
causal=False,
).transpose(1, 2)
except Exception as e:
logging.warning(f"Flash Attention failed, using default SDPA: {e}")
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
if not skip_output_reshape:
out = (
out.transpose(1, 2).reshape(b, -1, heads * dim_head)
)
return out
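The removed block registers flash attention as a torch.library custom op with a fake (meta) implementation so torch.compile can trace the call without running the external kernel. A toy registration showing the same mechanics without requiring flash-attn (assumes PyTorch >= 2.4; the op name is arbitrary):

```python
import torch

@torch.library.custom_op("example::sdpa_wrapper", mutates_args=())
def sdpa_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # real implementation; the original wraps flash_attn_func here
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

@sdpa_wrapper.register_fake
def _(q, k, v):
    return q.new_empty(q.shape)  # only the output shape/dtype, for tracing

q = k = v = torch.randn(1, 8, 16, 64)
print(sdpa_wrapper(q, k, v).shape)  # torch.Size([1, 8, 16, 64])
```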
optimized_attention = attention_basic optimized_attention = attention_basic
if model_management.sage_attention_enabled(): if model_management.sage_attention_enabled():
@@ -581,9 +504,6 @@ if model_management.sage_attention_enabled():
elif model_management.xformers_enabled(): elif model_management.xformers_enabled():
logging.info("Using xformers attention") logging.info("Using xformers attention")
optimized_attention = attention_xformers optimized_attention = attention_xformers
elif model_management.flash_attention_enabled():
logging.info("Using Flash Attention")
optimized_attention = attention_flash
elif model_management.pytorch_attention_enabled(): elif model_management.pytorch_attention_enabled():
logging.info("Using pytorch attention") logging.info("Using pytorch attention")
optimized_attention = attention_pytorch optimized_attention = attention_pytorch
@@ -850,7 +770,6 @@ class SpatialTransformer(nn.Module):
if not isinstance(context, list): if not isinstance(context, list):
context = [context] * len(self.transformer_blocks) context = [context] * len(self.transformer_blocks)
b, c, h, w = x.shape b, c, h, w = x.shape
transformer_options["activations_shape"] = list(x.shape)
x_in = x x_in = x
x = self.norm(x) x = self.norm(x)
if not self.use_linear: if not self.use_linear:
@@ -966,7 +885,6 @@ class SpatialVideoTransformer(SpatialTransformer):
transformer_options={} transformer_options={}
) -> torch.Tensor: ) -> torch.Tensor:
_, _, h, w = x.shape _, _, h, w = x.shape
transformer_options["activations_shape"] = list(x.shape)
x_in = x x_in = x
spatial_context = None spatial_context = None
if exists(context): if exists(context):

View File

@@ -9,6 +9,7 @@ from einops import repeat
from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.modules.attention import optimized_attention
from comfy.ldm.flux.layers import EmbedND from comfy.ldm.flux.layers import EmbedND
from comfy.ldm.flux.math import apply_rope from comfy.ldm.flux.math import apply_rope
from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
import comfy.ldm.common_dit import comfy.ldm.common_dit
import comfy.model_management import comfy.model_management
@@ -48,8 +49,8 @@ class WanSelfAttention(nn.Module):
self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_q = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_k = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
def forward(self, x, freqs): def forward(self, x, freqs):
r""" r"""
@@ -82,7 +83,7 @@ class WanSelfAttention(nn.Module):
class WanT2VCrossAttention(WanSelfAttention): class WanT2VCrossAttention(WanSelfAttention):
def forward(self, x, context, **kwargs): def forward(self, x, context):
r""" r"""
Args: Args:
x(Tensor): Shape [B, L1, C] x(Tensor): Shape [B, L1, C]
@@ -113,16 +114,16 @@ class WanI2VCrossAttention(WanSelfAttention):
self.k_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.k_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.v_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.v_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
# self.alpha = nn.Parameter(torch.zeros((1, ))) # self.alpha = nn.Parameter(torch.zeros((1, )))
self.norm_k_img = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_k_img = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
def forward(self, x, context, context_img_len): def forward(self, x, context):
r""" r"""
Args: Args:
x(Tensor): Shape [B, L1, C] x(Tensor): Shape [B, L1, C]
context(Tensor): Shape [B, L2, C] context(Tensor): Shape [B, L2, C]
""" """
context_img = context[:, :context_img_len] context_img = context[:, :257]
context = context[:, context_img_len:] context = context[:, 257:]
# compute query, key, value # compute query, key, value
q = self.norm_q(self.q(x)) q = self.norm_q(self.q(x))
@@ -192,7 +193,6 @@ class WanAttentionBlock(nn.Module):
e, e,
freqs, freqs,
context, context,
context_img_len=257,
): ):
r""" r"""
Args: Args:
@@ -213,94 +213,12 @@ class WanAttentionBlock(nn.Module):
x = x + y * e[2] x = x + y * e[2]
# cross-attention & ffn # cross-attention & ffn
x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) x = x + self.cross_attn(self.norm3(x), context)
y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3]) y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
x = x + y * e[5] x = x + y * e[5]
return x return x
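A small illustration of the context_img_len change above (tensor sizes are arbitrary): the packed context is CLIP image tokens followed by text tokens, and passing the image-token count explicitly lets cross-attention split them without hard-coding 257.

```python
import torch

def split_context(context, context_img_len):
    return context[:, :context_img_len], context[:, context_img_len:]

clip_tokens = torch.randn(2, 257, 1024)   # e.g. CLIP vision tokens
text_tokens = torch.randn(2, 512, 1024)
packed = torch.cat([clip_tokens, text_tokens], dim=1)
img, txt = split_context(packed, clip_tokens.shape[-2])
print(img.shape, txt.shape)  # torch.Size([2, 257, 1024]) torch.Size([2, 512, 1024])
```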
class VaceWanAttentionBlock(WanAttentionBlock):
def __init__(
self,
cross_attn_type,
dim,
ffn_dim,
num_heads,
window_size=(-1, -1),
qk_norm=True,
cross_attn_norm=False,
eps=1e-6,
block_id=0,
operation_settings={}
):
super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
self.block_id = block_id
if block_id == 0:
self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, c, x, **kwargs):
if self.block_id == 0:
c = self.before_proj(c) + x
c = super().forward(c, **kwargs)
c_skip = self.after_proj(c)
return c_skip, c
class WanCamAdapter(nn.Module):
def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_blocks=1, operation_settings={}):
super(WanCamAdapter, self).__init__()
# Pixel Unshuffle: reduce spatial dimensions by a factor of 8
self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=8)
# Convolution: reduce spatial dimensions by a factor
# of 2 (without overlap)
self.conv = operation_settings.get("operations").Conv2d(in_dim * 64, out_dim, kernel_size=kernel_size, stride=stride, padding=0, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
# Residual blocks for feature extraction
self.residual_blocks = nn.Sequential(
*[WanCamResidualBlock(out_dim, operation_settings = operation_settings) for _ in range(num_residual_blocks)]
)
def forward(self, x):
# Reshape to merge the frame dimension into batch
bs, c, f, h, w = x.size()
x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
# Pixel Unshuffle operation
x_unshuffled = self.pixel_unshuffle(x)
# Convolution operation
x_conv = self.conv(x_unshuffled)
# Feature extraction with residual blocks
out = self.residual_blocks(x_conv)
# Reshape to restore original bf dimension
out = out.view(bs, f, out.size(1), out.size(2), out.size(3))
# Permute dimensions to reorder (if needed), e.g., swap channels and feature frames
out = out.permute(0, 2, 1, 3, 4)
return out
class WanCamResidualBlock(nn.Module):
def __init__(self, dim, operation_settings={}):
super(WanCamResidualBlock, self).__init__()
self.conv1 = operation_settings.get("operations").Conv2d(dim, dim, kernel_size=3, padding=1, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
self.relu = nn.ReLU(inplace=True)
self.conv2 = operation_settings.get("operations").Conv2d(dim, dim, kernel_size=3, padding=1, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
def forward(self, x):
residual = x
out = self.relu(self.conv1(x))
out = self.conv2(out)
out += residual
return out
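A quick shape walk-through of the adapter's front end (toy sizes, plain torch.nn modules rather than the operations namespace): PixelUnshuffle(8) trades an 8x8 spatial block for 64x more channels, and the following strided conv aligns the camera features with the latent patch grid.

```python
import torch
import torch.nn as nn

x = torch.randn(2, 24, 64, 64)                   # [b*f, c, h, w] camera conditions
unshuffled = nn.PixelUnshuffle(8)(x)
print(unshuffled.shape)                          # torch.Size([2, 1536, 8, 8])

conv = nn.Conv2d(24 * 64, 2048, kernel_size=(2, 2), stride=(2, 2))
print(conv(unshuffled).shape)                    # torch.Size([2, 2048, 4, 4])
```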
class Head(nn.Module): class Head(nn.Module):
def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}): def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}):
@@ -332,7 +250,7 @@ class Head(nn.Module):
class MLPProj(torch.nn.Module): class MLPProj(torch.nn.Module):
def __init__(self, in_dim, out_dim, flf_pos_embed_token_number=None, operation_settings={}): def __init__(self, in_dim, out_dim, operation_settings={}):
super().__init__() super().__init__()
self.proj = torch.nn.Sequential( self.proj = torch.nn.Sequential(
@@ -340,15 +258,7 @@ class MLPProj(torch.nn.Module):
torch.nn.GELU(), operation_settings.get("operations").Linear(in_dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), torch.nn.GELU(), operation_settings.get("operations").Linear(in_dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
operation_settings.get("operations").LayerNorm(out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))) operation_settings.get("operations").LayerNorm(out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
if flf_pos_embed_token_number is not None:
self.emb_pos = nn.Parameter(torch.empty((1, flf_pos_embed_token_number, in_dim), device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
else:
self.emb_pos = None
def forward(self, image_embeds): def forward(self, image_embeds):
if self.emb_pos is not None:
image_embeds = image_embeds[:, :self.emb_pos.shape[1]] + comfy.model_management.cast_to(self.emb_pos[:, :image_embeds.shape[1]], dtype=image_embeds.dtype, device=image_embeds.device)
clip_extra_context_tokens = self.proj(image_embeds) clip_extra_context_tokens = self.proj(image_embeds)
return clip_extra_context_tokens return clip_extra_context_tokens
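The removed emb_pos logic adds a learned positional table to the CLIP image embeddings, truncating both sides to the shorter length so mismatched token counts still line up. A sketch with assumed sizes (the 514-row table is illustrative):

```python
import torch

emb_pos = torch.zeros(1, 514, 1280)          # learned positional table (assumed size)
image_embeds = torch.randn(2, 257, 1280)

n = min(emb_pos.shape[1], image_embeds.shape[1])
image_embeds = image_embeds[:, :n] + emb_pos[:, :n]
print(image_embeds.shape)                    # torch.Size([2, 257, 1280])
```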
@@ -374,7 +284,6 @@ class WanModel(torch.nn.Module):
qk_norm=True, qk_norm=True,
cross_attn_norm=True, cross_attn_norm=True,
eps=1e-6, eps=1e-6,
flf_pos_embed_token_number=None,
image_model=None, image_model=None,
device=None, device=None,
dtype=None, dtype=None,
@@ -464,7 +373,7 @@ class WanModel(torch.nn.Module):
self.rope_embedder = EmbedND(dim=d, theta=10000.0, axes_dim=[d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)]) self.rope_embedder = EmbedND(dim=d, theta=10000.0, axes_dim=[d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)])
if model_type == 'i2v': if model_type == 'i2v':
self.img_emb = MLPProj(1280, dim, flf_pos_embed_token_number=flf_pos_embed_token_number, operation_settings=operation_settings) self.img_emb = MLPProj(1280, dim, operation_settings=operation_settings)
else: else:
self.img_emb = None self.img_emb = None
@@ -475,8 +384,6 @@ class WanModel(torch.nn.Module):
context, context,
clip_fea=None, clip_fea=None,
freqs=None, freqs=None,
transformer_options={},
**kwargs,
): ):
r""" r"""
Forward pass through the diffusion model Forward pass through the diffusion model
@@ -512,25 +419,18 @@ class WanModel(torch.nn.Module):
# context # context
context = self.text_embedding(context) context = self.text_embedding(context)
context_img_len = None if clip_fea is not None and self.img_emb is not None:
if clip_fea is not None: context_clip = self.img_emb(clip_fea) # bs x 257 x dim
if self.img_emb is not None: context = torch.concat([context_clip, context], dim=1)
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
context = torch.concat([context_clip, context], dim=1)
context_img_len = clip_fea.shape[-2]
patches_replace = transformer_options.get("patches_replace", {}) # arguments
blocks_replace = patches_replace.get("dit", {}) kwargs = dict(
for i, block in enumerate(self.blocks): e=e0,
if ("double_block", i) in blocks_replace: freqs=freqs,
def block_wrap(args): context=context)
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len) for block in self.blocks:
return out x = block(x, **kwargs)
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
# head # head
x = self.head(x, e) x = self.head(x, e)
@@ -539,20 +439,13 @@ class WanModel(torch.nn.Module):
x = self.unpatchify(x, grid_sizes) x = self.unpatchify(x, grid_sizes)
return x return x
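The rewritten loop above threads a "patches_replace" hook through the transformer blocks: a patch registered under ("double_block", i) is handed the original block through a wrapper and may run arbitrary logic around it. A toy, framework-free sketch of the mechanism:

```python
def run_blocks(x, context, blocks, transformer_options):
    blocks_replace = transformer_options.get("patches_replace", {}).get("dit", {})
    for i, block in enumerate(blocks):
        if ("double_block", i) in blocks_replace:
            def block_wrap(args):
                return {"img": block(args["img"], context=args["txt"])}
            out = blocks_replace[("double_block", i)]({"img": x, "txt": context},
                                                      {"original_block": block_wrap})
            x = out["img"]
        else:
            x = block(x, context=context)
    return x

blocks = [lambda img, context: img + 1, lambda img, context: img * 2]
patch = lambda args, extra: extra["original_block"](args)   # pass-through patch on block 1
print(run_blocks(0, None, blocks, {"patches_replace": {"dit": {("double_block", 1): patch}}}))
# 2
```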
def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): def forward(self, x, timestep, context, clip_fea=None, **kwargs):
bs, c, t, h, w = x.shape bs, c, t, h, w = x.shape
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
patch_size = self.patch_size patch_size = self.patch_size
t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
if time_dim_concat is not None:
time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
x = torch.cat([x, time_dim_concat], dim=2)
t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0])
img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
@@ -560,7 +453,7 @@ class WanModel(torch.nn.Module):
img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs) img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
freqs = self.rope_embedder(img_ids).movedim(1, 2) freqs = self.rope_embedder(img_ids).movedim(1, 2)
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w] return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs)[:, :, :t, :h, :w]
def unpatchify(self, x, grid_sizes): def unpatchify(self, x, grid_sizes):
r""" r"""
@@ -585,209 +478,3 @@ class WanModel(torch.nn.Module):
u = torch.einsum('bfhwpqrc->bcfphqwr', u) u = torch.einsum('bfhwpqrc->bcfphqwr', u)
u = u.reshape(b, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)]) u = u.reshape(b, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)])
return u return u
class VaceWanModel(WanModel):
r"""
Wan diffusion backbone supporting both text-to-video and image-to-video.
"""
def __init__(self,
model_type='vace',
patch_size=(1, 2, 2),
text_len=512,
in_dim=16,
dim=2048,
ffn_dim=8192,
freq_dim=256,
text_dim=4096,
out_dim=16,
num_heads=16,
num_layers=32,
window_size=(-1, -1),
qk_norm=True,
cross_attn_norm=True,
eps=1e-6,
flf_pos_embed_token_number=None,
image_model=None,
vace_layers=None,
vace_in_dim=None,
device=None,
dtype=None,
operations=None,
):
super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations)
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
# Vace
if vace_layers is not None:
self.vace_layers = vace_layers
self.vace_in_dim = vace_in_dim
# vace blocks
self.vace_blocks = nn.ModuleList([
VaceWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm, self.cross_attn_norm, self.eps, block_id=i, operation_settings=operation_settings)
for i in range(self.vace_layers)
])
self.vace_layers_mapping = {i: n for n, i in enumerate(range(0, self.num_layers, self.num_layers // self.vace_layers))}
# vace patch embeddings
self.vace_patch_embedding = operations.Conv3d(
self.vace_in_dim, self.dim, kernel_size=self.patch_size, stride=self.patch_size, device=device, dtype=torch.float32
)
def forward_orig(
self,
x,
t,
context,
vace_context,
vace_strength,
clip_fea=None,
freqs=None,
transformer_options={},
**kwargs,
):
# embeddings
x = self.patch_embedding(x.float()).to(x.dtype)
grid_sizes = x.shape[2:]
x = x.flatten(2).transpose(1, 2)
# time embeddings
e = self.time_embedding(
sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
e0 = self.time_projection(e).unflatten(1, (6, self.dim))
# context
context = self.text_embedding(context)
context_img_len = None
if clip_fea is not None:
if self.img_emb is not None:
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
context = torch.concat([context_clip, context], dim=1)
context_img_len = clip_fea.shape[-2]
orig_shape = list(vace_context.shape)
vace_context = vace_context.movedim(0, 1).reshape([-1] + orig_shape[2:])
c = self.vace_patch_embedding(vace_context.float()).to(vace_context.dtype)
c = c.flatten(2).transpose(1, 2)
c = list(c.split(orig_shape[0], dim=0))
# arguments
x_orig = x
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.blocks):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
ii = self.vace_layers_mapping.get(i, None)
if ii is not None:
for iii in range(len(c)):
c_skip, c[iii] = self.vace_blocks[ii](c[iii], x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
x += c_skip * vace_strength[iii]
del c_skip
# head
x = self.head(x, e)
# unpatchify
x = self.unpatchify(x, grid_sizes)
return x
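A compact numeric sketch of the VACE wiring above (stand-in arithmetic instead of real blocks): every (num_layers // vace_layers)-th transformer block has a paired control block, and that block's skip output is added to the hidden states scaled by the per-stream strength.

```python
num_layers, vace_layers = 8, 2
mapping = {i: n for n, i in enumerate(range(0, num_layers, num_layers // vace_layers))}
print(mapping)  # {0: 0, 4: 1}

x, c, strength = 1.0, 10.0, 0.5
for i in range(num_layers):
    x = x + 1                      # stand-in for the main transformer block
    if i in mapping:
        c_skip, c = c, c + 1       # stand-in for the paired VACE block
        x += c_skip * strength
print(x)  # 1 + 8 + 10*0.5 + 11*0.5 = 19.5
```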
class CameraWanModel(WanModel):
r"""
Wan diffusion backbone supporting both text-to-video and image-to-video.
"""
def __init__(self,
model_type='camera',
patch_size=(1, 2, 2),
text_len=512,
in_dim=16,
dim=2048,
ffn_dim=8192,
freq_dim=256,
text_dim=4096,
out_dim=16,
num_heads=16,
num_layers=32,
window_size=(-1, -1),
qk_norm=True,
cross_attn_norm=True,
eps=1e-6,
flf_pos_embed_token_number=None,
image_model=None,
in_dim_control_adapter=24,
device=None,
dtype=None,
operations=None,
):
super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations)
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
self.control_adapter = WanCamAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], operation_settings=operation_settings)
def forward_orig(
self,
x,
t,
context,
clip_fea=None,
freqs=None,
camera_conditions = None,
transformer_options={},
**kwargs,
):
# embeddings
x = self.patch_embedding(x.float()).to(x.dtype)
if self.control_adapter is not None and camera_conditions is not None:
x_camera = self.control_adapter(camera_conditions).to(x.dtype)
x = x + x_camera
grid_sizes = x.shape[2:]
x = x.flatten(2).transpose(1, 2)
# time embeddings
e = self.time_embedding(
sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
e0 = self.time_projection(e).unflatten(1, (6, self.dim))
# context
context = self.text_embedding(context)
context_img_len = None
if clip_fea is not None:
if self.img_emb is not None:
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
context = torch.concat([context_clip, context], dim=1)
context_img_len = clip_fea.shape[-2]
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.blocks):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
# head
x = self.head(x, e)
# unpatchify
x = self.unpatchify(x, grid_sizes)
return x
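CameraWanModel adds the control adapter's output directly onto the patch embeddings before they are flattened into tokens. A self-contained sketch of that additive injection; the module shapes and kernel sizes here are assumptions for illustration, not ComfyUI's actual WanCamAdapter:
import torch
import torch.nn as nn

class TinyControlAdapter(nn.Module):
    def __init__(self, in_dim, dim):
        super().__init__()
        self.proj = nn.Conv3d(in_dim, dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))

    def forward(self, cond):
        return self.proj(cond)

patch_embedding = nn.Conv3d(16, 128, kernel_size=(1, 2, 2), stride=(1, 2, 2))
adapter = TinyControlAdapter(24, 128)

x = torch.randn(1, 16, 4, 32, 32)        # latent video (B, C, T, H, W)
camera = torch.randn(1, 24, 4, 32, 32)   # camera conditions with extra channels

tokens = patch_embedding(x)
tokens = tokens + adapter(camera).to(tokens.dtype)   # additive injection, as in forward_orig
grid_sizes = tokens.shape[2:]
tokens = tokens.flatten(2).transpose(1, 2)           # (B, seq_len, dim)
print(grid_sizes, tokens.shape)                      # torch.Size([4, 16, 16]) torch.Size([1, 1024, 128])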

View File

@@ -20,7 +20,6 @@ from __future__ import annotations
import comfy.utils
import comfy.model_management
import comfy.model_base
import comfy.weight_adapter as weight_adapter
import logging
import torch
@@ -50,12 +49,139 @@ def load_lora(lora, to_load, log_missing=True):
dora_scale = lora[dora_scale_name]
loaded_keys.add(dora_scale_name)
for adapter_cls in weight_adapter.adapters:
    adapter = adapter_cls.load(x, lora, alpha, dora_scale, loaded_keys)
    if adapter is not None:
        patch_dict[to_load[x]] = adapter
        loaded_keys.update(adapter.loaded_keys)
        continue

reshape_name = "{}.reshape_weight".format(x)
reshape = None
if reshape_name in lora.keys():
    try:
        reshape = lora[reshape_name].tolist()
        loaded_keys.add(reshape_name)
    except:
        pass
regular_lora = "{}.lora_up.weight".format(x)
diffusers_lora = "{}_lora.up.weight".format(x)
diffusers2_lora = "{}.lora_B.weight".format(x)
diffusers3_lora = "{}.lora.up.weight".format(x)
mochi_lora = "{}.lora_B".format(x)
transformers_lora = "{}.lora_linear_layer.up.weight".format(x)
A_name = None
if regular_lora in lora.keys():
A_name = regular_lora
B_name = "{}.lora_down.weight".format(x)
mid_name = "{}.lora_mid.weight".format(x)
elif diffusers_lora in lora.keys():
A_name = diffusers_lora
B_name = "{}_lora.down.weight".format(x)
mid_name = None
elif diffusers2_lora in lora.keys():
A_name = diffusers2_lora
B_name = "{}.lora_A.weight".format(x)
mid_name = None
elif diffusers3_lora in lora.keys():
A_name = diffusers3_lora
B_name = "{}.lora.down.weight".format(x)
mid_name = None
elif mochi_lora in lora.keys():
A_name = mochi_lora
B_name = "{}.lora_A".format(x)
mid_name = None
elif transformers_lora in lora.keys():
A_name = transformers_lora
B_name ="{}.lora_linear_layer.down.weight".format(x)
mid_name = None
if A_name is not None:
mid = None
if mid_name is not None and mid_name in lora.keys():
mid = lora[mid_name]
loaded_keys.add(mid_name)
patch_dict[to_load[x]] = ("lora", (lora[A_name], lora[B_name], alpha, mid, dora_scale, reshape))
loaded_keys.add(A_name)
loaded_keys.add(B_name)
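Whichever naming convention matches, the stored pair describes a low-rank update that is later applied as W' = W + strength * (alpha / rank) * (up @ down). A hedged sketch of that application step (names and shapes are illustrative; the real path goes through calculate_weight below):
import torch

def apply_lora(weight, up, down, alpha=None, strength=1.0):
    rank = down.shape[0]                              # alpha is divided by the down matrix's rank
    scale = (alpha / rank) if alpha is not None else 1.0
    diff = torch.mm(up.flatten(start_dim=1), down.flatten(start_dim=1)).reshape(weight.shape)
    return weight + strength * scale * diff.to(weight.dtype)

weight = torch.randn(640, 320)
up, down = torch.randn(640, 8), torch.randn(8, 320)   # rank-8 adapter
print(apply_lora(weight, up, down, alpha=8.0).shape)  # torch.Size([640, 320])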
######## loha
hada_w1_a_name = "{}.hada_w1_a".format(x)
hada_w1_b_name = "{}.hada_w1_b".format(x)
hada_w2_a_name = "{}.hada_w2_a".format(x)
hada_w2_b_name = "{}.hada_w2_b".format(x)
hada_t1_name = "{}.hada_t1".format(x)
hada_t2_name = "{}.hada_t2".format(x)
if hada_w1_a_name in lora.keys():
hada_t1 = None
hada_t2 = None
if hada_t1_name in lora.keys():
hada_t1 = lora[hada_t1_name]
hada_t2 = lora[hada_t2_name]
loaded_keys.add(hada_t1_name)
loaded_keys.add(hada_t2_name)
patch_dict[to_load[x]] = ("loha", (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha, lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2, dora_scale))
loaded_keys.add(hada_w1_a_name)
loaded_keys.add(hada_w1_b_name)
loaded_keys.add(hada_w2_a_name)
loaded_keys.add(hada_w2_b_name)
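For LoHa the two low-rank products are combined with an elementwise (Hadamard) product rather than added, i.e. delta_W = (hada_w1_a @ hada_w1_b) * (hada_w2_a @ hada_w2_b), scaled by alpha / rank. A small sketch with illustrative shapes:
import torch

def loha_diff(w1_a, w1_b, w2_a, w2_b, alpha=None):
    rank = w1_b.shape[0]
    scale = (alpha / rank) if alpha is not None else 1.0
    return scale * (torch.mm(w1_a, w1_b) * torch.mm(w2_a, w2_b))   # elementwise product of two low-rank matrices

out_f, in_f, rank = 320, 320, 4
diff = loha_diff(torch.randn(out_f, rank), torch.randn(rank, in_f),
                 torch.randn(out_f, rank), torch.randn(rank, in_f), alpha=4.0)
print(diff.shape)   # torch.Size([320, 320])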
######## lokr
lokr_w1_name = "{}.lokr_w1".format(x)
lokr_w2_name = "{}.lokr_w2".format(x)
lokr_w1_a_name = "{}.lokr_w1_a".format(x)
lokr_w1_b_name = "{}.lokr_w1_b".format(x)
lokr_t2_name = "{}.lokr_t2".format(x)
lokr_w2_a_name = "{}.lokr_w2_a".format(x)
lokr_w2_b_name = "{}.lokr_w2_b".format(x)
lokr_w1 = None
if lokr_w1_name in lora.keys():
lokr_w1 = lora[lokr_w1_name]
loaded_keys.add(lokr_w1_name)
lokr_w2 = None
if lokr_w2_name in lora.keys():
lokr_w2 = lora[lokr_w2_name]
loaded_keys.add(lokr_w2_name)
lokr_w1_a = None
if lokr_w1_a_name in lora.keys():
lokr_w1_a = lora[lokr_w1_a_name]
loaded_keys.add(lokr_w1_a_name)
lokr_w1_b = None
if lokr_w1_b_name in lora.keys():
lokr_w1_b = lora[lokr_w1_b_name]
loaded_keys.add(lokr_w1_b_name)
lokr_w2_a = None
if lokr_w2_a_name in lora.keys():
lokr_w2_a = lora[lokr_w2_a_name]
loaded_keys.add(lokr_w2_a_name)
lokr_w2_b = None
if lokr_w2_b_name in lora.keys():
lokr_w2_b = lora[lokr_w2_b_name]
loaded_keys.add(lokr_w2_b_name)
lokr_t2 = None
if lokr_t2_name in lora.keys():
lokr_t2 = lora[lokr_t2_name]
loaded_keys.add(lokr_t2_name)
if (lokr_w1 is not None) or (lokr_w2 is not None) or (lokr_w1_a is not None) or (lokr_w2_a is not None):
patch_dict[to_load[x]] = ("lokr", (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale))
#glora
a1_name = "{}.a1.weight".format(x)
a2_name = "{}.a2.weight".format(x)
b1_name = "{}.b1.weight".format(x)
b2_name = "{}.b2.weight".format(x)
if a1_name in lora:
patch_dict[to_load[x]] = ("glora", (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha, dora_scale))
loaded_keys.add(a1_name)
loaded_keys.add(a2_name)
loaded_keys.add(b1_name)
loaded_keys.add(b2_name)
w_norm_name = "{}.w_norm".format(x) w_norm_name = "{}.w_norm".format(x)
b_norm_name = "{}.b_norm".format(x) b_norm_name = "{}.b_norm".format(x)
@@ -279,23 +405,29 @@ def model_lora_keys_unet(model, key_map={}):
key_map["transformer.{}".format(key_lora)] = k key_map["transformer.{}".format(key_lora)] = k
key_map["diffusion_model.{}".format(key_lora)] = k # Old loras key_map["diffusion_model.{}".format(key_lora)] = k # Old loras
if isinstance(model, comfy.model_base.HiDream):
for k in sdk:
if k.startswith("diffusion_model."):
if k.endswith(".weight"):
key_lora = k[len("diffusion_model."):-len(".weight")]
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format
key_map["transformer.{}".format(key_lora)] = k #SimpleTuner regular format
if isinstance(model, comfy.model_base.ACEStep):
for k in sdk:
if k.startswith("diffusion_model.") and k.endswith(".weight"): #Official ACE step lora format
key_lora = k[len("diffusion_model."):-len(".weight")]
key_map["{}".format(key_lora)] = k
return key_map
def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
lora_diff *= alpha
weight_calc = weight + function(lora_diff).type(weight.dtype)
weight_norm = (
weight_calc.transpose(0, 1)
.reshape(weight_calc.shape[1], -1)
.norm(dim=1, keepdim=True)
.reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
.transpose(0, 1)
)
weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
if strength != 1.0:
weight_calc -= weight
weight += strength * (weight_calc)
else:
weight[:] = weight_calc
return weight
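weight_decompose applies a DoRA-style update: after adding the low-rank diff, each input-channel slice of the weight is renormalized and rescaled by the learned dora_scale, so the patch changes direction while the magnitude is controlled separately. A 2D-only sketch of that rescale (shapes are illustrative):
import torch

def dora_rescale(weight, lora_diff, dora_scale, alpha=1.0):
    w = weight + alpha * lora_diff
    col_norm = w.norm(dim=0, keepdim=True)    # one norm per input channel, as in weight_norm above
    return w * (dora_scale / col_norm)

weight = torch.randn(640, 320)
lora_diff = torch.randn(640, 8) @ torch.randn(8, 320)
dora_scale = torch.ones(1, 320)               # learned per-channel magnitudes (all 1.0 here)
print(dora_rescale(weight, lora_diff, dora_scale).shape)   # torch.Size([640, 320])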
def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Tensor:
"""
Pad a tensor to a new shape with zeros.
@@ -350,16 +482,6 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
if isinstance(v, list):
v = (calculate_weight(v[1:], v[0][1](comfy.model_management.cast_to_device(v[0][0], weight.device, intermediate_dtype, copy=True), inplace=True), key, intermediate_dtype=intermediate_dtype), )
if isinstance(v, weight_adapter.WeightAdapterBase):
output = v.calculate_weight(weight, key, strength, strength_model, offset, function, intermediate_dtype, original_weights)
if output is None:
logging.warning("Calculate Weight Failed: {} {}".format(v.name, key))
else:
weight = output
if old_weight is not None:
weight = old_weight
continue
if len(v) == 1:
patch_type = "diff"
elif len(v) == 2:
@@ -386,6 +508,157 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
diff_weight = comfy.model_management.cast_to_device(target_weight, weight.device, intermediate_dtype) - \
comfy.model_management.cast_to_device(original_weights[key][0][0], weight.device, intermediate_dtype)
weight += function(strength * comfy.model_management.cast_to_device(diff_weight, weight.device, weight.dtype))
elif patch_type == "lora": #lora/locon
mat1 = comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype)
mat2 = comfy.model_management.cast_to_device(v[1], weight.device, intermediate_dtype)
dora_scale = v[4]
reshape = v[5]
if reshape is not None:
weight = pad_tensor_to_shape(weight, reshape)
if v[2] is not None:
alpha = v[2] / mat2.shape[0]
else:
alpha = 1.0
if v[3] is not None:
#locon mid weights, hopefully the math is fine because I didn't properly test it
mat3 = comfy.model_management.cast_to_device(v[3], weight.device, intermediate_dtype)
final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
try:
lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "lokr":
w1 = v[0]
w2 = v[1]
w1_a = v[3]
w1_b = v[4]
w2_a = v[5]
w2_b = v[6]
t2 = v[7]
dora_scale = v[8]
dim = None
if w1 is None:
dim = w1_b.shape[0]
w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
else:
w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
if w2 is None:
dim = w2_b.shape[0]
if t2 is None:
w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
else:
w2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
else:
w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
if len(w2.shape) == 4:
w1 = w1.unsqueeze(2).unsqueeze(2)
if v[2] is not None and dim is not None:
alpha = v[2] / dim
else:
alpha = 1.0
try:
lora_diff = torch.kron(w1, w2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "loha":
w1a = v[0]
w1b = v[1]
if v[2] is not None:
alpha = v[2] / w1b.shape[0]
else:
alpha = 1.0
w2a = v[3]
w2b = v[4]
dora_scale = v[7]
if v[5] is not None: #cp decomposition
t1 = v[5]
t2 = v[6]
m1 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
m2 = torch.einsum('i j k l, j r, i p -> p r k l',
comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
else:
m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
try:
lora_diff = (m1 * m2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
elif patch_type == "glora":
dora_scale = v[5]
old_glora = False
if v[3].shape[1] == v[2].shape[0] == v[0].shape[0] == v[1].shape[1]:
rank = v[0].shape[0]
old_glora = True
if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]:
if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]:
pass
else:
old_glora = False
rank = v[1].shape[0]
a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
if v[4] is not None:
alpha = v[4] / rank
else:
alpha = 1.0
try:
if old_glora:
lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora
else:
if weight.dim() > 2:
lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
else:
lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
lora_diff += torch.mm(b1, b2).reshape(weight.shape)
if dora_scale is not None:
weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
else:
weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
except Exception as e:
logging.error("ERROR {} {} {}".format(patch_type, key, e))
else:
logging.warning("patch type not recognized {} {}".format(patch_type, key))

View File

@@ -1,5 +1,4 @@
import torch
import comfy.utils
def convert_lora_bfl_control(sd): #BFL loras for Flux
@@ -12,13 +11,7 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
return sd_out
def convert_lora_wan_fun(sd): #Wan Fun loras
return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
def convert_lora(sd):
if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
return convert_lora_bfl_control(sd)
if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
return convert_lora_wan_fun(sd)
return sd

View File

@@ -36,10 +36,6 @@ import comfy.ldm.hunyuan_video.model
import comfy.ldm.cosmos.model import comfy.ldm.cosmos.model
import comfy.ldm.lumina.model import comfy.ldm.lumina.model
import comfy.ldm.wan.model import comfy.ldm.wan.model
import comfy.ldm.hunyuan3d.model
import comfy.ldm.hidream.model
import comfy.ldm.chroma.model
import comfy.ldm.ace.model
import comfy.model_management import comfy.model_management
import comfy.patcher_extension import comfy.patcher_extension
@@ -62,7 +58,6 @@ class ModelType(Enum):
FLOW = 6 FLOW = 6
V_PREDICTION_CONTINUOUS = 7 V_PREDICTION_CONTINUOUS = 7
FLUX = 8 FLUX = 8
IMG_TO_IMG = 9
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@@ -93,8 +88,6 @@ def model_sampling(model_config, model_type):
elif model_type == ModelType.FLUX: elif model_type == ModelType.FLUX:
c = comfy.model_sampling.CONST c = comfy.model_sampling.CONST
s = comfy.model_sampling.ModelSamplingFlux s = comfy.model_sampling.ModelSamplingFlux
elif model_type == ModelType.IMG_TO_IMG:
c = comfy.model_sampling.IMG_TO_IMG
class ModelSampling(s, c): class ModelSampling(s, c):
pass pass
@@ -115,7 +108,7 @@ class BaseModel(torch.nn.Module):
if not unet_config.get("disable_unet_model_creation", False): if not unet_config.get("disable_unet_model_creation", False):
if model_config.custom_operations is None: if model_config.custom_operations is None:
fp8 = model_config.optimizations.get("fp8", False) fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8) operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
else: else:
operations = model_config.custom_operations operations = model_config.custom_operations
@@ -135,7 +128,6 @@ class BaseModel(torch.nn.Module):
logging.info("model_type {}".format(model_type.name)) logging.info("model_type {}".format(model_type.name))
logging.debug("adm {}".format(self.adm_channels)) logging.debug("adm {}".format(self.adm_channels))
self.memory_usage_factor = model_config.memory_usage_factor self.memory_usage_factor = model_config.memory_usage_factor
self.memory_usage_factor_conds = ()
def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs): def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor( return comfy.patcher_extension.WrapperExecutor.new_class_executor(
@@ -147,7 +139,6 @@ class BaseModel(torch.nn.Module):
def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs): def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
sigma = t sigma = t
xc = self.model_sampling.calculate_input(sigma, x) xc = self.model_sampling.calculate_input(sigma, x)
if c_concat is not None: if c_concat is not None:
xc = torch.cat([xc] + [c_concat], dim=1) xc = torch.cat([xc] + [c_concat], dim=1)
@@ -168,20 +159,11 @@ class BaseModel(torch.nn.Module):
if hasattr(extra, "dtype"): if hasattr(extra, "dtype"):
if extra.dtype != torch.int and extra.dtype != torch.long: if extra.dtype != torch.int and extra.dtype != torch.long:
extra = extra.to(dtype) extra = extra.to(dtype)
if isinstance(extra, list):
ex = []
for ext in extra:
ex.append(ext.to(dtype))
extra = ex
extra_conds[o] = extra extra_conds[o] = extra
t = self.process_timestep(t, x=x, **extra_conds)
model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float() model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
return self.model_sampling.calculate_denoised(sigma, model_output, x) return self.model_sampling.calculate_denoised(sigma, model_output, x)
def process_timestep(self, timestep, **kwargs):
return timestep
def get_dtype(self): def get_dtype(self):
return self.diffusion_model.dtype return self.diffusion_model.dtype
@@ -203,11 +185,6 @@ class BaseModel(torch.nn.Module):
if concat_latent_image.shape[1:] != noise.shape[1:]: if concat_latent_image.shape[1:] != noise.shape[1:]:
concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center") concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
if noise.ndim == 5:
if concat_latent_image.shape[-3] < noise.shape[-3]:
concat_latent_image = torch.nn.functional.pad(concat_latent_image, (0, 0, 0, 0, 0, noise.shape[-3] - concat_latent_image.shape[-3]), "constant", 0)
else:
concat_latent_image = concat_latent_image[:, :, :noise.shape[-3]]
concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0]) concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])
@@ -236,11 +213,6 @@ class BaseModel(torch.nn.Module):
cond_concat.append(self.blank_inpaint_image_like(noise)) cond_concat.append(self.blank_inpaint_image_like(noise))
elif ck == "mask_inverted": elif ck == "mask_inverted":
cond_concat.append(torch.zeros_like(noise)[:, :1]) cond_concat.append(torch.zeros_like(noise)[:, :1])
if ck == "concat_image":
if concat_latent_image is not None:
cond_concat.append(concat_latent_image.to(device))
else:
cond_concat.append(torch.zeros_like(noise))
data = torch.cat(cond_concat, dim=1) data = torch.cat(cond_concat, dim=1)
return data return data
return None return None
@@ -331,28 +303,19 @@ class BaseModel(torch.nn.Module):
def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs): def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
return self.model_sampling.noise_scaling(sigma.reshape([sigma.shape[0]] + [1] * (len(noise.shape) - 1)), noise, latent_image) return self.model_sampling.noise_scaling(sigma.reshape([sigma.shape[0]] + [1] * (len(noise.shape) - 1)), noise, latent_image)
def memory_required(self, input_shape, cond_shapes={}): def memory_required(self, input_shape):
input_shapes = [input_shape]
for c in self.memory_usage_factor_conds:
shape = cond_shapes.get(c, None)
if shape is not None and len(shape) > 0:
input_shapes += shape
if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention(): if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
dtype = self.get_dtype() dtype = self.get_dtype()
if self.manual_cast_dtype is not None: if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype dtype = self.manual_cast_dtype
#TODO: this needs to be tweaked #TODO: this needs to be tweaked
area = sum(map(lambda input_shape: input_shape[0] * math.prod(input_shape[2:]), input_shapes)) area = input_shape[0] * math.prod(input_shape[2:])
return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024) return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
else: else:
#TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory. #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
area = sum(map(lambda input_shape: input_shape[0] * math.prod(input_shape[2:]), input_shapes)) area = input_shape[0] * math.prod(input_shape[2:])
return (area * 0.15 * self.memory_usage_factor) * (1024 * 1024) return (area * 0.15 * self.memory_usage_factor) * (1024 * 1024)
def extra_conds_shapes(self, **kwargs):
return {}
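The memory_required heuristic above scales with batch size times the product of the non-channel dimensions, the dtype size, and a per-model usage factor. A standalone sketch of the estimate (the 0.01 and MB constants are the heuristics from the code, not guarantees):
import math

def estimate_bytes(input_shape, dtype_size=2, memory_usage_factor=1.0):
    area = input_shape[0] * math.prod(input_shape[2:])    # batch * spatial dims, channel dim skipped
    return area * dtype_size * 0.01 * memory_usage_factor * (1024 * 1024)

print(estimate_bytes((1, 4, 128, 128)) / 1e9, "GB")       # roughly 0.34 GB for an fp16 SD-sized latent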
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
adm_inputs = []
@@ -623,19 +586,6 @@ class SDXL_instructpix2pix(IP2P, SDXL):
else: else:
self.process_ip2p_image_in = lambda image: image #diffusers ip2p self.process_ip2p_image_in = lambda image: image #diffusers ip2p
class Lotus(BaseModel):
def extra_conds(self, **kwargs):
out = {}
cross_attn = kwargs.get("cross_attn", None)
out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
device = kwargs["device"]
task_emb = torch.tensor([1, 0]).float().to(device)
task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)]).unsqueeze(0)
out['y'] = comfy.conds.CONDRegular(task_emb)
return out
def __init__(self, model_config, model_type=ModelType.IMG_TO_IMG, device=None):
super().__init__(model_config, model_type, device=device)
class StableCascade_C(BaseModel): class StableCascade_C(BaseModel):
def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
@@ -803,8 +753,8 @@ class PixArt(BaseModel):
return out return out
class Flux(BaseModel): class Flux(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.flux.model.Flux): def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=unet_model) super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
def concat_cond(self, **kwargs): def concat_cond(self, **kwargs):
try: try:
@@ -895,26 +845,17 @@ class LTXV(BaseModel):
if cross_attn is not None: if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
guiding_latent = kwargs.get("guiding_latent", None)
if guiding_latent is not None:
out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
guiding_latent_noise_scale = kwargs.get("guiding_latent_noise_scale", None)
if guiding_latent_noise_scale is not None:
out["guiding_latent_noise_scale"] = comfy.conds.CONDConstant(guiding_latent_noise_scale)
out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25)) out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
if denoise_mask is not None:
out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
keyframe_idxs = kwargs.get("keyframe_idxs", None)
if keyframe_idxs is not None:
out['keyframe_idxs'] = comfy.conds.CONDRegular(keyframe_idxs)
return out return out
def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
if denoise_mask is None:
return timestep
return self.diffusion_model.patchifier.patchify(((denoise_mask) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1)))[:, :1])[0]
def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
return latent_image
class HunyuanVideo(BaseModel): class HunyuanVideo(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None): def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo) super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
@@ -931,39 +872,20 @@ class HunyuanVideo(BaseModel):
if cross_attn is not None: if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
image = kwargs.get("concat_latent_image", None)
noise = kwargs.get("noise", None)
if image is not None:
padding_shape = (noise.shape[0], 16, noise.shape[2] - 1, noise.shape[3], noise.shape[4])
latent_padding = torch.zeros(padding_shape, device=noise.device, dtype=noise.dtype)
image_latents = torch.cat([image.to(noise), latent_padding], dim=2)
out['c_concat'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_latents))
guidance = kwargs.get("guidance", 6.0) guidance = kwargs.get("guidance", 6.0)
if guidance is not None: if guidance is not None:
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
guiding_frame_index = kwargs.get("guiding_frame_index", None)
if guiding_frame_index is not None:
out['guiding_frame_index'] = comfy.conds.CONDRegular(torch.FloatTensor([guiding_frame_index]))
ref_latent = kwargs.get("ref_latent", None)
if ref_latent is not None:
out['ref_latent'] = comfy.conds.CONDRegular(self.process_latent_in(ref_latent))
return out return out
def scale_latent_inpaint(self, latent_image, **kwargs):
return latent_image
class HunyuanVideoI2V(HunyuanVideo):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device)
self.concat_keys = ("concat_image", "mask_inverted")
def scale_latent_inpaint(self, latent_image, **kwargs):
return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
class HunyuanVideoSkyreelsI2V(HunyuanVideo):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device)
self.concat_keys = ("concat_image",)
def scale_latent_inpaint(self, latent_image, **kwargs):
return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
class CosmosVideo(BaseModel): class CosmosVideo(BaseModel):
def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None): def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT) super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT)
@@ -1013,42 +935,29 @@ class WAN21(BaseModel):
self.image_to_video = image_to_video self.image_to_video = image_to_video
def concat_cond(self, **kwargs): def concat_cond(self, **kwargs):
noise = kwargs.get("noise", None) if not self.image_to_video:
extra_channels = self.diffusion_model.patch_embedding.weight.shape[1] - noise.shape[1]
if extra_channels == 0:
return None return None
image = kwargs.get("concat_latent_image", None) image = kwargs.get("concat_latent_image", None)
noise = kwargs.get("noise", None)
device = kwargs["device"] device = kwargs["device"]
if image is None: if image is None:
shape_image = list(noise.shape) image = torch.zeros_like(noise)
shape_image[1] = extra_channels
image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
else:
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
for i in range(0, image.shape[1], 16):
image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
image = utils.resize_to_batch_size(image, noise.shape[0])
if not self.image_to_video or extra_channels == image.shape[1]: image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
return image image = self.process_latent_in(image)
image = utils.resize_to_batch_size(image, noise.shape[0])
if image.shape[1] > (extra_channels - 4):
image = image[:, :(extra_channels - 4)]
mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
if mask is None: if mask is None:
mask = torch.zeros_like(noise)[:, :4] mask = torch.zeros_like(noise)[:, :4]
else: else:
if mask.shape[1] != 4: mask = 1.0 - torch.mean(mask, dim=1, keepdim=True)
mask = torch.mean(mask, dim=1, keepdim=True)
mask = 1.0 - mask
mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
if mask.shape[-3] < noise.shape[-3]: if mask.shape[-3] < noise.shape[-3]:
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0) mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
if mask.shape[1] == 1: mask = mask.repeat(1, 4, 1, 1, 1)
mask = mask.repeat(1, 4, 1, 1, 1)
mask = utils.resize_to_batch_size(mask, noise.shape[0]) mask = utils.resize_to_batch_size(mask, noise.shape[0])
return torch.cat((mask, image), dim=1)
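The concat conditioning assembled above stacks a 4-channel mask with the (optionally zeroed) reference latent along the channel dimension. A minimal sketch with illustrative latent shapes:
import torch

noise = torch.randn(1, 16, 5, 32, 32)      # (B, C, T, H, W) latent noise
image = torch.zeros_like(noise)            # reference latent, zeros when no image is given
mask = torch.zeros_like(noise)[:, :1]      # single-channel mask
mask = mask.repeat(1, 4, 1, 1, 1)          # expanded to 4 channels

c_concat = torch.cat((mask, image), dim=1)
print(c_concat.shape)                      # torch.Size([1, 20, 5, 32, 32])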
@@ -1062,122 +971,4 @@ class WAN21(BaseModel):
clip_vision_output = kwargs.get("clip_vision_output", None) clip_vision_output = kwargs.get("clip_vision_output", None)
if clip_vision_output is not None: if clip_vision_output is not None:
out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states) out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states)
time_dim_concat = kwargs.get("time_dim_concat", None)
if time_dim_concat is not None:
out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat))
return out
class WAN21_Vace(WAN21):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.VaceWanModel)
self.image_to_video = image_to_video
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
noise = kwargs.get("noise", None)
noise_shape = list(noise.shape)
vace_frames = kwargs.get("vace_frames", None)
if vace_frames is None:
noise_shape[1] = 32
vace_frames = [torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)]
mask = kwargs.get("vace_mask", None)
if mask is None:
noise_shape[1] = 64
mask = [torch.ones(noise_shape, device=noise.device, dtype=noise.dtype)] * len(vace_frames)
vace_frames_out = []
for j in range(len(vace_frames)):
vf = vace_frames[j].clone()
for i in range(0, vf.shape[1], 16):
vf[:, i:i + 16] = self.process_latent_in(vf[:, i:i + 16])
vf = torch.cat([vf, mask[j]], dim=1)
vace_frames_out.append(vf)
vace_frames = torch.stack(vace_frames_out, dim=1)
out['vace_context'] = comfy.conds.CONDRegular(vace_frames)
vace_strength = kwargs.get("vace_strength", [1.0] * len(vace_frames_out))
out['vace_strength'] = comfy.conds.CONDConstant(vace_strength)
return out
class WAN21_Camera(WAN21):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.CameraWanModel)
self.image_to_video = image_to_video
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
camera_conditions = kwargs.get("camera_conditions", None)
if camera_conditions is not None:
out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions)
return out
class Hunyuan3Dv2(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
guidance = kwargs.get("guidance", 5.0)
if guidance is not None:
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
return out
class HiDream(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream.model.HiDreamImageTransformer2DModel)
def encode_adm(self, **kwargs):
return kwargs["pooled_output"]
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
conditioning_llama3 = kwargs.get("conditioning_llama3", None)
if conditioning_llama3 is not None:
out['encoder_hidden_states_llama3'] = comfy.conds.CONDRegular(conditioning_llama3)
image_cond = kwargs.get("concat_latent_image", None)
if image_cond is not None:
out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
return out
class Chroma(Flux):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
guidance = kwargs.get("guidance", 0)
if guidance is not None:
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
return out
class ACEStep(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.model.ACEStepTransformer2DModel)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
noise = kwargs.get("noise", None)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
if cross_attn is not None:
out['lyric_token_idx'] = comfy.conds.CONDRegular(conditioning_lyrics)
out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
return out return out

View File

@@ -1,4 +1,3 @@
import json
import comfy.supported_models import comfy.supported_models
import comfy.supported_models_base import comfy.supported_models_base
import comfy.utils import comfy.utils
@@ -34,7 +33,7 @@ def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack, time_stack_cross return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack, time_stack_cross
return None return None
def detect_unet_config(state_dict, key_prefix, metadata=None): def detect_unet_config(state_dict, key_prefix):
state_dict_keys = list(state_dict.keys()) state_dict_keys = list(state_dict.keys())
if '{}joint_blocks.0.context_block.attn.qkv.weight'.format(key_prefix) in state_dict_keys: #mmdit model if '{}joint_blocks.0.context_block.attn.qkv.weight'.format(key_prefix) in state_dict_keys: #mmdit model
@@ -154,7 +153,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["guidance_embed"] = len(guidance_keys) > 0 dit_config["guidance_embed"] = len(guidance_keys) > 0
return dit_config return dit_config
if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys: #Flux
dit_config = {} dit_config = {}
dit_config["image_model"] = "flux" dit_config["image_model"] = "flux"
dit_config["in_channels"] = 16 dit_config["in_channels"] = 16
@@ -164,9 +163,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
if in_key in state_dict_keys: if in_key in state_dict_keys:
dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size) dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
dit_config["out_channels"] = 16 dit_config["out_channels"] = 16
vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix) dit_config["vec_in_dim"] = 768
if vec_in_key in state_dict_keys:
dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
dit_config["context_in_dim"] = 4096 dit_config["context_in_dim"] = 4096
dit_config["hidden_size"] = 3072 dit_config["hidden_size"] = 3072
dit_config["mlp_ratio"] = 4.0 dit_config["mlp_ratio"] = 4.0
@@ -176,16 +173,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["axes_dim"] = [16, 56, 56] dit_config["axes_dim"] = [16, 56, 56]
dit_config["theta"] = 10000 dit_config["theta"] = 10000
dit_config["qkv_bias"] = True dit_config["qkv_bias"] = True
if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
dit_config["image_model"] = "chroma"
dit_config["in_channels"] = 64
dit_config["out_channels"] = 64
dit_config["in_dim"] = 64
dit_config["out_dim"] = 3072
dit_config["hidden_dim"] = 5120
dit_config["n_layers"] = 5
else:
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
return dit_config return dit_config
if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview
@@ -222,37 +210,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
dit_config = {} dit_config = {}
dit_config["image_model"] = "ltxv" dit_config["image_model"] = "ltxv"
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
shape = state_dict['{}transformer_blocks.0.attn2.to_k.weight'.format(key_prefix)].shape
dit_config["attention_head_dim"] = shape[0] // 32
dit_config["cross_attention_dim"] = shape[1]
if metadata is not None and "config" in metadata:
dit_config.update(json.loads(metadata["config"]).get("transformer", {}))
return dit_config
if '{}genre_embedder.weight'.format(key_prefix) in state_dict_keys: #ACE-Step model
dit_config = {}
dit_config["audio_model"] = "ace"
dit_config["attention_head_dim"] = 128
dit_config["in_channels"] = 8
dit_config["inner_dim"] = 2560
dit_config["max_height"] = 16
dit_config["max_position"] = 32768
dit_config["max_width"] = 32768
dit_config["mlp_ratio"] = 2.5
dit_config["num_attention_heads"] = 20
dit_config["num_layers"] = 24
dit_config["out_channels"] = 8
dit_config["patch_size"] = [16, 1]
dit_config["rope_theta"] = 1000000.0
dit_config["speaker_embedding_dim"] = 512
dit_config["text_embedding_dim"] = 768
dit_config["ssl_encoder_depths"] = [8, 8]
dit_config["ssl_latent_dims"] = [1024, 768]
dit_config["ssl_names"] = ["mert", "m-hubert"]
dit_config["lyric_encoder_vocab_size"] = 6693
dit_config["lyric_hidden_size"] = 1024
return dit_config return dit_config
if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt
@@ -357,54 +314,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["cross_attn_norm"] = True dit_config["cross_attn_norm"] = True
dit_config["eps"] = 1e-6 dit_config["eps"] = 1e-6
dit_config["in_dim"] = state_dict['{}patch_embedding.weight'.format(key_prefix)].shape[1] dit_config["in_dim"] = state_dict['{}patch_embedding.weight'.format(key_prefix)].shape[1]
if '{}vace_patch_embedding.weight'.format(key_prefix) in state_dict_keys: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
dit_config["model_type"] = "vace" dit_config["model_type"] = "i2v"
dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1]
dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.')
elif '{}control_adapter.conv.weight'.format(key_prefix) in state_dict_keys:
dit_config["model_type"] = "camera"
else: else:
if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "t2v"
dit_config["model_type"] = "i2v"
else:
dit_config["model_type"] = "t2v"
flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix))
if flf_weight is not None:
dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1]
return dit_config
if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
in_shape = state_dict['{}latent_in.weight'.format(key_prefix)].shape
dit_config = {}
dit_config["image_model"] = "hunyuan3d2"
dit_config["in_channels"] = in_shape[1]
dit_config["context_in_dim"] = state_dict['{}cond_in.weight'.format(key_prefix)].shape[1]
dit_config["hidden_size"] = in_shape[0]
dit_config["mlp_ratio"] = 4.0
dit_config["num_heads"] = 16
dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
dit_config["qkv_bias"] = True
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
return dit_config
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
dit_config = {}
dit_config["image_model"] = "hidream"
dit_config["attention_head_dim"] = 128
dit_config["axes_dims_rope"] = [64, 32, 32]
dit_config["caption_channels"] = [4096, 4096]
dit_config["max_resolution"] = [128, 128]
dit_config["in_channels"] = 16
dit_config["llama_layers"] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]
dit_config["num_attention_heads"] = 20
dit_config["num_routed_experts"] = 4
dit_config["num_activated_experts"] = 2
dit_config["num_layers"] = 16
dit_config["num_single_layers"] = 32
dit_config["out_channels"] = 16
dit_config["patch_size"] = 2
dit_config["text_emb_dim"] = 2048
return dit_config return dit_config
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
@@ -541,8 +454,8 @@ def model_config_from_unet_config(unet_config, state_dict=None):
logging.error("no match {}".format(unet_config)) logging.error("no match {}".format(unet_config))
return None return None
def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False, metadata=None): def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False):
unet_config = detect_unet_config(state_dict, unet_key_prefix, metadata=metadata) unet_config = detect_unet_config(state_dict, unet_key_prefix)
if unet_config is None: if unet_config is None:
return None return None
model_config = model_config_from_unet_config(unet_config, state_dict) model_config = model_config_from_unet_config(unet_config, state_dict)
@@ -555,10 +468,6 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
model_config.scaled_fp8 = scaled_fp8_weight.dtype model_config.scaled_fp8 = scaled_fp8_weight.dtype
if model_config.scaled_fp8 == torch.float32: if model_config.scaled_fp8 == torch.float32:
model_config.scaled_fp8 = torch.float8_e4m3fn model_config.scaled_fp8 = torch.float8_e4m3fn
if scaled_fp8_weight.nelement() == 2:
model_config.optimizations["fp8"] = False
else:
model_config.optimizations["fp8"] = True
return model_config return model_config
@@ -620,9 +529,6 @@ def convert_config(unet_config):
def unet_config_from_diffusers_unet(state_dict, dtype=None): def unet_config_from_diffusers_unet(state_dict, dtype=None):
if "conv_in.weight" not in state_dict:
return None
match = {} match = {}
transformer_depth = [] transformer_depth = []
@@ -754,13 +660,8 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False} 'use_temporal_attention': False, 'use_temporal_resblock': False}
LotusD = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': 4,
'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_heads': 8,
'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
supported_models = [LotusD, SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint] supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]
for unet_config in supported_models: for unet_config in supported_models:
matches = True matches = True

View File

@@ -15,7 +15,6 @@
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. along with this program. If not, see <https://www.gnu.org/licenses/>.
""" """
from __future__ import annotations
import psutil import psutil
import logging import logging
@@ -27,10 +26,6 @@ import platform
import weakref import weakref
import gc import gc
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
class VRAMState(Enum): class VRAMState(Enum):
DISABLED = 0 #No vram present: no need to move models to vram DISABLED = 0 #No vram present: no need to move models to vram
NO_VRAM = 1 #Very low vram: enable all the options to save vram NO_VRAM = 1 #Very low vram: enable all the options to save vram
@@ -51,32 +46,6 @@ cpu_state = CPUState.GPU
total_vram = 0 total_vram = 0
def get_supported_float8_types():
float8_types = []
try:
float8_types.append(torch.float8_e4m3fn)
except:
pass
try:
float8_types.append(torch.float8_e4m3fnuz)
except:
pass
try:
float8_types.append(torch.float8_e5m2)
except:
pass
try:
float8_types.append(torch.float8_e5m2fnuz)
except:
pass
try:
float8_types.append(torch.float8_e8m0fnu)
except:
pass
return float8_types
FLOAT8_TYPES = get_supported_float8_types()
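get_supported_float8_types probes each float8 dtype inside try/except because older torch builds do not expose all of them. An equivalent hedged probe using getattr (same attribute names; availability depends on the installed torch):
import torch

FLOAT8_NAMES = ("float8_e4m3fn", "float8_e4m3fnuz", "float8_e5m2", "float8_e5m2fnuz", "float8_e8m0fnu")
float8_types = [getattr(torch, name) for name in FLOAT8_NAMES if hasattr(torch, name)]
print(float8_types)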
xpu_available = False
torch_version = ""
try:
@@ -176,25 +145,6 @@ def get_torch_device():
else: else:
return torch.device(torch.cuda.current_device()) return torch.device(torch.cuda.current_device())
def get_all_torch_devices(exclude_current=False):
global cpu_state
devices = []
if cpu_state == CPUState.GPU:
if is_nvidia():
for i in range(torch.cuda.device_count()):
devices.append(torch.device(i))
elif is_intel_xpu():
for i in range(torch.xpu.device_count()):
devices.append(torch.device(i))
elif is_ascend_npu():
for i in range(torch.npu.device_count()):
devices.append(torch.device(i))
else:
devices.append(get_torch_device())
if exclude_current:
devices.remove(get_torch_device())
return devices
def get_total_memory(dev=None, torch_total_too=False): def get_total_memory(dev=None, torch_total_too=False):
global directml_enabled global directml_enabled
if dev is None: if dev is None:
@@ -236,21 +186,12 @@ def get_total_memory(dev=None, torch_total_too=False):
else: else:
return mem_total return mem_total
def mac_version():
try:
return tuple(int(n) for n in platform.mac_ver()[0].split("."))
except:
return None
total_vram = get_total_memory(get_torch_device()) / (1024 * 1024) total_vram = get_total_memory(get_torch_device()) / (1024 * 1024)
total_ram = psutil.virtual_memory().total / (1024 * 1024) total_ram = psutil.virtual_memory().total / (1024 * 1024)
logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram)) logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))
try: try:
logging.info("pytorch version: {}".format(torch_version)) logging.info("pytorch version: {}".format(torch_version))
mac_ver = mac_version()
if mac_ver is not None:
logging.info("Mac Version {}".format(mac_ver))
except: except:
pass pass
@@ -321,16 +262,11 @@ except:
try: try:
if is_amd(): if is_amd():
try:
rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
except:
rocm_version = (6, -1)
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
logging.info("AMD arch: {}".format(arch)) logging.info("AMD arch: {}".format(arch))
logging.info("ROCm version: {}".format(rocm_version))
if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
if torch_version_numeric[0] >= 2 and torch_version_numeric[1] >= 7: # works on 2.6 but doesn't actually seem to improve much if torch_version_numeric[0] >= 2 and torch_version_numeric[1] >= 7: # works on 2.6 but doesn't actually seem to improve much
if any((a in arch) for a in ["gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches if any((a in arch) for a in ["gfx1100", "gfx1101"]): # TODO: more arches
ENABLE_PYTORCH_ATTENTION = True ENABLE_PYTORCH_ATTENTION = True
except: except:
pass pass
@@ -411,13 +347,9 @@ try:
logging.info("Device: {}".format(get_torch_device_name(get_torch_device()))) logging.info("Device: {}".format(get_torch_device_name(get_torch_device())))
except: except:
logging.warning("Could not pick default device.") logging.warning("Could not pick default device.")
try:
for device in get_all_torch_devices(exclude_current=True):
logging.info("Device: {}".format(get_torch_device_name(device)))
except:
pass
current_loaded_models: list[LoadedModel] = []
current_loaded_models = []
def module_size(module): def module_size(module):
module_mem = 0 module_mem = 0
@@ -428,7 +360,7 @@ def module_size(module):
return module_mem return module_mem
class LoadedModel: class LoadedModel:
def __init__(self, model: ModelPatcher): def __init__(self, model):
self._set_model(model) self._set_model(model)
self.device = model.load_device self.device = model.load_device
self.real_model = None self.real_model = None
@@ -436,7 +368,7 @@ class LoadedModel:
self.model_finalizer = None self.model_finalizer = None
self._patcher_finalizer = None self._patcher_finalizer = None
def _set_model(self, model: ModelPatcher): def _set_model(self, model):
self._model = weakref.ref(model) self._model = weakref.ref(model)
if model.parent is not None: if model.parent is not None:
self._parent_model = weakref.ref(model.parent) self._parent_model = weakref.ref(model.parent)
@@ -649,7 +581,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
loaded_memory = loaded_model.model_loaded_memory() loaded_memory = loaded_model.model_loaded_memory()
current_free_mem = get_free_memory(torch_dev) + loaded_memory current_free_mem = get_free_memory(torch_dev) + loaded_memory
lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory())) lowvram_model_memory = max(64 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory) lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
if vram_set_state == VRAMState.NO_VRAM: if vram_set_state == VRAMState.NO_VRAM:
@@ -728,7 +660,7 @@ def unet_inital_load_device(parameters, dtype):
return torch_dev return torch_dev
cpu_dev = torch.device("cpu") cpu_dev = torch.device("cpu")
if DISABLE_SMART_MEMORY or vram_state == VRAMState.NO_VRAM: if DISABLE_SMART_MEMORY:
return cpu_dev return cpu_dev
model_size = dtype_size(dtype) * parameters model_size = dtype_size(dtype) * parameters
@@ -758,12 +690,13 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
return torch.float8_e4m3fn return torch.float8_e4m3fn
if args.fp8_e5m2_unet: if args.fp8_e5m2_unet:
return torch.float8_e5m2 return torch.float8_e5m2
if args.fp8_e8m0fnu_unet:
return torch.float8_e8m0fnu
fp8_dtype = None fp8_dtype = None
if weight_dtype in FLOAT8_TYPES: try:
fp8_dtype = weight_dtype if weight_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
fp8_dtype = weight_dtype
except:
pass
if fp8_dtype is not None: if fp8_dtype is not None:
if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
@@ -858,8 +791,6 @@ def text_encoder_dtype(device=None):
return torch.float8_e5m2 return torch.float8_e5m2
elif args.fp16_text_enc: elif args.fp16_text_enc:
return torch.float16 return torch.float16
elif args.bf16_text_enc:
return torch.bfloat16
elif args.fp32_text_enc: elif args.fp32_text_enc:
return torch.float32 return torch.float32
@@ -972,61 +903,15 @@ def force_channels_last():
#TODO #TODO
return False return False
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
STREAMS = {}
NUM_STREAMS = 1
if args.async_offload:
NUM_STREAMS = 2
logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS))
stream_counters = {}
def get_offload_stream(device):
stream_counter = stream_counters.get(device, 0)
if NUM_STREAMS <= 1:
return None
if device in STREAMS:
ss = STREAMS[device]
s = ss[stream_counter]
stream_counter = (stream_counter + 1) % len(ss)
if is_device_cuda(device):
ss[stream_counter].wait_stream(torch.cuda.current_stream())
stream_counters[device] = stream_counter
return s
elif is_device_cuda(device):
ss = []
for k in range(NUM_STREAMS):
ss.append(torch.cuda.Stream(device=device, priority=0))
STREAMS[device] = ss
s = ss[stream_counter]
stream_counter = (stream_counter + 1) % len(ss)
stream_counters[device] = stream_counter
return s
return None
def sync_stream(device, stream):
if stream is None:
return
if is_device_cuda(device):
torch.cuda.current_stream().wait_stream(stream)
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
if device is None or weight.device == device: if device is None or weight.device == device:
if not copy: if not copy:
if dtype is None or weight.dtype == dtype: if dtype is None or weight.dtype == dtype:
return weight return weight
if stream is not None:
with stream:
return weight.to(dtype=dtype, copy=copy)
return weight.to(dtype=dtype, copy=copy) return weight.to(dtype=dtype, copy=copy)
if stream is not None: r = torch.empty_like(weight, dtype=dtype, device=device)
with stream: r.copy_(weight, non_blocking=non_blocking)
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight, non_blocking=non_blocking)
else:
r = torch.empty_like(weight, dtype=dtype, device=device)
r.copy_(weight, non_blocking=non_blocking)
return r return r
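Editor's note: the get_offload_stream / sync_stream / stream-aware cast_to trio in this hunk overlaps weight transfers with compute by rotating through a small pool of side streams and re-synchronizing before the copied tensor is read. A hedged sketch of the same round-robin idea reduced to plain CUDA streams; the helper names are illustrative and the snippet only runs when CUDA is available:

import torch

NUM_STREAMS = 2
_pool = {}    # device -> list of side streams
_next = {}    # device -> index of the next stream to hand out

def next_stream(device):
    if device not in _pool:
        _pool[device] = [torch.cuda.Stream(device=device) for _ in range(NUM_STREAMS)]
        _next[device] = 0
    i = _next[device]
    _next[device] = (i + 1) % NUM_STREAMS
    s = _pool[device][i]
    s.wait_stream(torch.cuda.current_stream())    # copy must not start before queued work on the main stream
    return s

def copy_on_stream(weight, device, dtype):
    s = next_stream(device)
    with torch.cuda.stream(s):                    # issue the transfer on the side stream
        out = torch.empty_like(weight, dtype=dtype, device=device)
        out.copy_(weight, non_blocking=True)
    torch.cuda.current_stream().wait_stream(s)    # main stream waits before reading `out`
    return out

if torch.cuda.is_available():
    w = torch.randn(1024, 1024, pin_memory=True)  # pinned host memory enables true async copies
    print(copy_on_stream(w, torch.device("cuda", 0), torch.float16).dtype)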
def cast_to_device(tensor, device, dtype, copy=False): def cast_to_device(tensor, device, dtype, copy=False):
@@ -1036,9 +921,6 @@ def cast_to_device(tensor, device, dtype, copy=False):
def sage_attention_enabled(): def sage_attention_enabled():
return args.use_sage_attention return args.use_sage_attention
def flash_attention_enabled():
return args.use_flash_attention
def xformers_enabled(): def xformers_enabled():
global directml_enabled global directml_enabled
global cpu_state global cpu_state
@@ -1087,6 +969,12 @@ def pytorch_attention_flash_attention():
return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
return False return False
def mac_version():
try:
return tuple(int(n) for n in platform.mac_ver()[0].split("."))
except:
return None
def force_upcast_attention_dtype(): def force_upcast_attention_dtype():
upcast = args.force_upcast_attention upcast = args.force_upcast_attention
@@ -1290,9 +1178,6 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
return False return False
def supports_fp8_compute(device=None): def supports_fp8_compute(device=None):
if args.supports_fp8_compute:
return True
if not is_nvidia(): if not is_nvidia():
return False return False
@@ -1321,41 +1206,13 @@ def soft_empty_cache(force=False):
torch.xpu.empty_cache() torch.xpu.empty_cache()
elif is_ascend_npu(): elif is_ascend_npu():
torch.npu.empty_cache() torch.npu.empty_cache()
elif is_mlu():
torch.mlu.empty_cache()
elif torch.cuda.is_available(): elif torch.cuda.is_available():
torch.cuda.empty_cache() torch.cuda.empty_cache()
torch.cuda.ipc_collect() torch.cuda.ipc_collect()
def unload_all_models(): def unload_all_models():
for device in get_all_torch_devices(): free_memory(1e30, get_torch_device())
free_memory(1e30, device)
def unload_model_and_clones(model: ModelPatcher, unload_additional_models=True, all_devices=False):
'Unload only model and its clones - primarily for multigpu cloning purposes.'
initial_keep_loaded: list[LoadedModel] = current_loaded_models.copy()
additional_models = []
if unload_additional_models:
additional_models = model.get_nested_additional_models()
keep_loaded = []
for loaded_model in initial_keep_loaded:
if loaded_model.model is not None:
if model.clone_base_uuid == loaded_model.model.clone_base_uuid:
continue
# check additional models if they are a match
skip = False
for add_model in additional_models:
if add_model.clone_base_uuid == loaded_model.model.clone_base_uuid:
skip = True
break
if skip:
continue
keep_loaded.append(loaded_model)
if not all_devices:
free_memory(1e30, get_torch_device(), keep_loaded)
else:
for device in get_all_torch_devices():
free_memory(1e30, device, keep_loaded)
#TODO: might be cleaner to put this somewhere else #TODO: might be cleaner to put this somewhere else
import threading import threading

View File

@@ -84,15 +84,12 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
def create_model_options_clone(orig_model_options: dict): def create_model_options_clone(orig_model_options: dict):
return comfy.patcher_extension.copy_nested_dicts(orig_model_options) return comfy.patcher_extension.copy_nested_dicts(orig_model_options)
def create_hook_patches_clone(orig_hook_patches, copy_tuples=False): def create_hook_patches_clone(orig_hook_patches):
new_hook_patches = {} new_hook_patches = {}
for hook_ref in orig_hook_patches: for hook_ref in orig_hook_patches:
new_hook_patches[hook_ref] = {} new_hook_patches[hook_ref] = {}
for k in orig_hook_patches[hook_ref]: for k in orig_hook_patches[hook_ref]:
new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:] new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:]
if copy_tuples:
for i in range(len(new_hook_patches[hook_ref][k])):
new_hook_patches[hook_ref][k][i] = tuple(new_hook_patches[hook_ref][k][i])
return new_hook_patches return new_hook_patches
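Editor's note: create_hook_patches_clone copies the outer dict, each per-hook dict, and each patch list; the copy_tuples variant in this hunk additionally re-wraps every entry with tuple(). A small stand-alone illustration of what each level of copying buys, with simplified stand-in entries rather than real hook patches:

def clone_patches(orig, copy_tuples=False):
    new = {}
    for ref, patches in orig.items():
        new[ref] = {}
        for k, lst in patches.items():
            new[ref][k] = lst[:]                                  # fresh list, same entry objects
            if copy_tuples:
                new[ref][k] = [tuple(e) for e in new[ref][k]]     # snapshot each entry as a tuple
    return new

entry = ["w1", 1.0]                          # stand-in entry; real hook patches look different
orig = {"hook_a": {"diff": [entry]}}
cloned = clone_patches(orig, copy_tuples=True)

orig["hook_a"]["diff"].append(["w2", 0.5])
assert len(cloned["hook_a"]["diff"]) == 1    # the list copy isolates appends/removals
entry[1] = 2.0
assert cloned["hook_a"]["diff"][0][1] == 1.0 # tuple() isolates in-place edits of the entry itself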
def wipe_lowvram_weight(m): def wipe_lowvram_weight(m):
@@ -243,9 +240,6 @@ class ModelPatcher:
self.is_clip = False self.is_clip = False
self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
self.is_multigpu_base_clone = False
self.clone_base_uuid = uuid.uuid4()
if not hasattr(self.model, 'model_loaded_weight_memory'): if not hasattr(self.model, 'model_loaded_weight_memory'):
self.model.model_loaded_weight_memory = 0 self.model.model_loaded_weight_memory = 0
@@ -324,92 +318,18 @@ class ModelPatcher:
n.is_clip = self.is_clip n.is_clip = self.is_clip
n.hook_mode = self.hook_mode n.hook_mode = self.hook_mode
n.is_multigpu_base_clone = self.is_multigpu_base_clone
n.clone_base_uuid = self.clone_base_uuid
for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE): for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE):
callback(self, n) callback(self, n)
return n return n
def deepclone_multigpu(self, new_load_device=None, models_cache: dict[uuid.UUID,ModelPatcher]=None):
logging.info(f"Creating deepclone of {self.model.__class__.__name__} for {new_load_device if new_load_device else self.load_device}.")
comfy.model_management.unload_model_and_clones(self)
n = self.clone()
# set load device, if present
if new_load_device is not None:
n.load_device = new_load_device
# unlike for normal clone, backup dicts that shared same ref should not;
# otherwise, patchers that have deep copies of base models will erroneously influence each other.
n.backup = copy.deepcopy(n.backup)
n.object_patches_backup = copy.deepcopy(n.object_patches_backup)
n.hook_backup = copy.deepcopy(n.hook_backup)
n.model = copy.deepcopy(n.model)
# multigpu clone should not have multigpu additional_models entry
n.remove_additional_models("multigpu")
# multigpu_clone all stored additional_models; make sure circular references are properly handled
if models_cache is None:
models_cache = {}
for key, model_list in n.additional_models.items():
for i in range(len(model_list)):
add_model = n.additional_models[key][i]
if add_model.clone_base_uuid not in models_cache:
models_cache[add_model.clone_base_uuid] = add_model.deepclone_multigpu(new_load_device=new_load_device, models_cache=models_cache)
n.additional_models[key][i] = models_cache[add_model.clone_base_uuid]
for callback in self.get_all_callbacks(CallbacksMP.ON_DEEPCLONE_MULTIGPU):
callback(self, n)
return n
def match_multigpu_clones(self):
multigpu_models = self.get_additional_models_with_key("multigpu")
if len(multigpu_models) > 0:
new_multigpu_models = []
for mm in multigpu_models:
# clone main model, but bring over relevant props from existing multigpu clone
n = self.clone()
n.load_device = mm.load_device
n.backup = mm.backup
n.object_patches_backup = mm.object_patches_backup
n.hook_backup = mm.hook_backup
n.model = mm.model
n.is_multigpu_base_clone = mm.is_multigpu_base_clone
n.remove_additional_models("multigpu")
orig_additional_models: dict[str, list[ModelPatcher]] = comfy.patcher_extension.copy_nested_dicts(n.additional_models)
n.additional_models = comfy.patcher_extension.copy_nested_dicts(mm.additional_models)
# figure out which additional models are not present in multigpu clone
models_cache = {}
for mm_add_model in mm.get_additional_models():
models_cache[mm_add_model.clone_base_uuid] = mm_add_model
remove_models_uuids = set(list(models_cache.keys()))
for key, model_list in orig_additional_models.items():
for orig_add_model in model_list:
if orig_add_model.clone_base_uuid not in models_cache:
models_cache[orig_add_model.clone_base_uuid] = orig_add_model.deepclone_multigpu(new_load_device=n.load_device, models_cache=models_cache)
existing_list = n.get_additional_models_with_key(key)
existing_list.append(models_cache[orig_add_model.clone_base_uuid])
n.set_additional_models(key, existing_list)
if orig_add_model.clone_base_uuid in remove_models_uuids:
remove_models_uuids.remove(orig_add_model.clone_base_uuid)
# remove duplicate additional models
for key, model_list in n.additional_models.items():
new_model_list = [x for x in model_list if x.clone_base_uuid not in remove_models_uuids]
n.set_additional_models(key, new_model_list)
for callback in self.get_all_callbacks(CallbacksMP.ON_MATCH_MULTIGPU_CLONES):
callback(self, n)
new_multigpu_models.append(n)
self.set_additional_models("multigpu", new_multigpu_models)
def is_clone(self, other): def is_clone(self, other):
if hasattr(other, 'model') and self.model is other.model: if hasattr(other, 'model') and self.model is other.model:
return True return True
return False return False
def clone_has_same_weights(self, clone: ModelPatcher, allow_multigpu=False): def clone_has_same_weights(self, clone: 'ModelPatcher'):
if allow_multigpu: if not self.is_clone(clone):
if self.clone_base_uuid != clone.clone_base_uuid: return False
return False
else:
if not self.is_clone(clone):
return False
if self.current_hooks != clone.current_hooks: if self.current_hooks != clone.current_hooks:
return False return False
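Editor's note: the allow_multigpu branch of clone_has_same_weights falls back from object identity to comparing clone_base_uuid, the uuid4 a patcher mints once and hands to every clone (see the clone()/deepclone_multigpu hunks above). A toy sketch of that lineage tag; the class below is a stand-in, not ComfyUI's ModelPatcher:

import copy
import uuid

class Patcher:
    def __init__(self):
        self.clone_base_uuid = uuid.uuid4()      # minted once per "base" model

    def clone(self):
        return copy.copy(self)                   # shallow clone shares the uuid (and the model)

    def deepclone(self):
        n = copy.deepcopy(self)                  # independent copy for another device...
        n.clone_base_uuid = self.clone_base_uuid # ...but the lineage tag is carried over
        return n

base, other = Patcher(), Patcher()
assert base.clone().clone_base_uuid == base.clone_base_uuid
assert base.deepclone().clone_base_uuid == base.clone_base_uuid
assert base.clone_base_uuid != other.clone_base_uuid   # unrelated patchers never match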
@@ -827,7 +747,6 @@ class ModelPatcher:
def partially_unload(self, device_to, memory_to_free=0): def partially_unload(self, device_to, memory_to_free=0):
with self.use_ejected(): with self.use_ejected():
hooks_unpatched = False
memory_freed = 0 memory_freed = 0
patch_counter = 0 patch_counter = 0
unload_list = self._load_list() unload_list = self._load_list()
@@ -851,10 +770,6 @@ class ModelPatcher:
move_weight = False move_weight = False
break break
if not hooks_unpatched:
self.unpatch_hooks()
hooks_unpatched = True
if bk.inplace_update: if bk.inplace_update:
comfy.utils.copy_to_param(self.model, key, bk.weight) comfy.utils.copy_to_param(self.model, key, bk.weight)
else: else:
@@ -1009,7 +924,7 @@ class ModelPatcher:
return self.additional_models.get(key, []) return self.additional_models.get(key, [])
def get_additional_models(self): def get_additional_models(self):
all_models: list[ModelPatcher] = [] all_models = []
for models in self.additional_models.values(): for models in self.additional_models.values():
all_models.extend(models) all_models.extend(models)
return all_models return all_models
@@ -1063,13 +978,9 @@ class ModelPatcher:
for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN): for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN):
callback(self) callback(self)
def prepare_state(self, timestep, model_options, ignore_multigpu=False): def prepare_state(self, timestep):
for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE): for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
callback(self, timestep, model_options, ignore_multigpu) callback(self, timestep)
if not ignore_multigpu and "multigpu_clones" in model_options:
for p in model_options["multigpu_clones"].values():
p: ModelPatcher
p.prepare_state(timestep, model_options, ignore_multigpu=True)
def restore_hook_patches(self): def restore_hook_patches(self):
if self.hook_patches_backup is not None: if self.hook_patches_backup is not None:
@@ -1082,18 +993,12 @@ class ModelPatcher:
def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]): def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]):
curr_t = t[0] curr_t = t[0]
reset_current_hooks = False reset_current_hooks = False
multigpu_kf_changed_cache = None
transformer_options = model_options.get("transformer_options", {}) transformer_options = model_options.get("transformer_options", {})
for hook in hook_group.hooks: for hook in hook_group.hooks:
changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options) changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options)
# if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref; # if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref;
# this will cause the weights to be recalculated when sampling # this will cause the weights to be recalculated when sampling
if changed: if changed:
# cache changed for multigpu usage
if "multigpu_clones" in model_options:
if multigpu_kf_changed_cache is None:
multigpu_kf_changed_cache = []
multigpu_kf_changed_cache.append(hook)
# reset current_hooks if contains hook that changed # reset current_hooks if contains hook that changed
if self.current_hooks is not None: if self.current_hooks is not None:
for current_hook in self.current_hooks.hooks: for current_hook in self.current_hooks.hooks:
@@ -1105,28 +1010,6 @@ class ModelPatcher:
self.cached_hook_patches.pop(cached_group) self.cached_hook_patches.pop(cached_group)
if reset_current_hooks: if reset_current_hooks:
self.patch_hooks(None) self.patch_hooks(None)
if "multigpu_clones" in model_options:
for p in model_options["multigpu_clones"].values():
p: ModelPatcher
p._handle_changed_hook_keyframes(multigpu_kf_changed_cache)
def _handle_changed_hook_keyframes(self, kf_changed_cache: list[comfy.hooks.Hook]):
'Used to handle multigpu behavior inside prepare_hook_patches_current_keyframe.'
if kf_changed_cache is None:
return
reset_current_hooks = False
# reset current_hooks if contains hook that changed
for hook in kf_changed_cache:
if self.current_hooks is not None:
for current_hook in self.current_hooks.hooks:
if current_hook == hook:
reset_current_hooks = True
break
for cached_group in list(self.cached_hook_patches.keys()):
if cached_group.contains(hook):
self.cached_hook_patches.pop(cached_group)
if reset_current_hooks:
self.patch_hooks(None)
def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None, def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None,
registered: comfy.hooks.HookGroup = None): registered: comfy.hooks.HookGroup = None):
@@ -1206,6 +1089,7 @@ class ModelPatcher:
def patch_hooks(self, hooks: comfy.hooks.HookGroup): def patch_hooks(self, hooks: comfy.hooks.HookGroup):
with self.use_ejected(): with self.use_ejected():
self.unpatch_hooks()
if hooks is not None: if hooks is not None:
model_sd_keys = list(self.model_state_dict().keys()) model_sd_keys = list(self.model_state_dict().keys())
memory_counter = None memory_counter = None
@@ -1216,16 +1100,12 @@ class ModelPatcher:
# if have cached weights for hooks, use it # if have cached weights for hooks, use it
cached_weights = self.cached_hook_patches.get(hooks, None) cached_weights = self.cached_hook_patches.get(hooks, None)
if cached_weights is not None: if cached_weights is not None:
model_sd_keys_set = set(model_sd_keys)
for key in cached_weights: for key in cached_weights:
if key not in model_sd_keys: if key not in model_sd_keys:
logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}") logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}")
continue continue
self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter) self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter)
model_sd_keys_set.remove(key)
self.unpatch_hooks(model_sd_keys_set)
else: else:
self.unpatch_hooks()
relevant_patches = self.get_combined_hook_patches(hooks=hooks) relevant_patches = self.get_combined_hook_patches(hooks=hooks)
original_weights = None original_weights = None
if len(relevant_patches) > 0: if len(relevant_patches) > 0:
@@ -1236,8 +1116,6 @@ class ModelPatcher:
continue continue
self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights, self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights,
memory_counter=memory_counter) memory_counter=memory_counter)
else:
self.unpatch_hooks()
self.current_hooks = hooks self.current_hooks = hooks
def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter): def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
@@ -1294,23 +1172,17 @@ class ModelPatcher:
del out_weight del out_weight
del weight del weight
def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None: def unpatch_hooks(self) -> None:
with self.use_ejected(): with self.use_ejected():
if len(self.hook_backup) == 0: if len(self.hook_backup) == 0:
self.current_hooks = None self.current_hooks = None
return return
keys = list(self.hook_backup.keys()) keys = list(self.hook_backup.keys())
if whitelist_keys_set: for k in keys:
for k in keys: comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
if k in whitelist_keys_set:
comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
self.hook_backup.pop(k)
else:
for k in keys:
comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
self.hook_backup.clear() self.hook_backup.clear()
self.current_hooks = None self.current_hooks = None
def clean_hooks(self): def clean_hooks(self):
self.unpatch_hooks() self.unpatch_hooks()

View File

@@ -69,15 +69,6 @@ class CONST:
sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1)) sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1))
return latent / (1.0 - sigma) return latent / (1.0 - sigma)
class X0(EPS):
def calculate_denoised(self, sigma, model_output, model_input):
return model_output
class IMG_TO_IMG(X0):
def calculate_input(self, sigma, noise):
return noise
class ModelSamplingDiscrete(torch.nn.Module): class ModelSamplingDiscrete(torch.nn.Module):
def __init__(self, model_config=None, zsnr=None): def __init__(self, model_config=None, zsnr=None):
super().__init__() super().__init__()
@@ -111,14 +102,13 @@ class ModelSamplingDiscrete(torch.nn.Module):
self.num_timesteps = int(timesteps) self.num_timesteps = int(timesteps)
self.linear_start = linear_start self.linear_start = linear_start
self.linear_end = linear_end self.linear_end = linear_end
self.zsnr = zsnr
# self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32)) # self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32))
# self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32)) # self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32))
# self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32)) # self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))
sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5 sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
if self.zsnr: if zsnr:
sigmas = rescale_zero_terminal_snr_sigmas(sigmas) sigmas = rescale_zero_terminal_snr_sigmas(sigmas)
self.set_sigmas(sigmas) self.set_sigmas(sigmas)
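Editor's note: ModelSamplingDiscrete converts the cumulative alphas into sigmas with sigma = sqrt((1 - alphas_cumprod) / alphas_cumprod), and the zsnr flag rescales those sigmas so the terminal step corresponds to pure noise. A short numeric sketch of the conversion itself; the beta schedule below is illustrative, not the exact one this class builds:

import torch

betas = torch.linspace(0.00085, 0.012, 1000)               # illustrative linear schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5    # same formula as in the hunk above
print(float(sigmas[0]), float(sigmas[-1]))                 # small noise level first, large last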

View File

@@ -1,167 +0,0 @@
from __future__ import annotations
import torch
import logging
from collections import namedtuple
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
import comfy.utils
import comfy.patcher_extension
import comfy.model_management
class GPUOptions:
def __init__(self, device_index: int, relative_speed: float):
self.device_index = device_index
self.relative_speed = relative_speed
def clone(self):
return GPUOptions(self.device_index, self.relative_speed)
def create_dict(self):
return {
"relative_speed": self.relative_speed
}
class GPUOptionsGroup:
def __init__(self):
self.options: dict[int, GPUOptions] = {}
def add(self, info: GPUOptions):
self.options[info.device_index] = info
def clone(self):
c = GPUOptionsGroup()
for opt in self.options.values():
c.add(opt)
return c
def register(self, model: ModelPatcher):
opts_dict = {}
# get devices that are valid for this model
devices: list[torch.device] = [model.load_device]
for extra_model in model.get_additional_models_with_key("multigpu"):
extra_model: ModelPatcher
devices.append(extra_model.load_device)
# create dictionary with actual device mapped to its GPUOptions
device_opts_list: list[GPUOptions] = []
for device in devices:
device_opts = self.options.get(device.index, GPUOptions(device_index=device.index, relative_speed=1.0))
opts_dict[device] = device_opts.create_dict()
device_opts_list.append(device_opts)
# make relative_speed relative to 1.0
min_speed = min([x.relative_speed for x in device_opts_list])
for value in opts_dict.values():
value['relative_speed'] /= min_speed
model.model_options['multigpu_options'] = opts_dict
def create_multigpu_deepclones(model: ModelPatcher, max_gpus: int, gpu_options: GPUOptionsGroup=None, reuse_loaded=False):
'Prepare ModelPatcher to contain deepclones of its BaseModel and related properties.'
model = model.clone()
# check if multigpu is already prepared - get the load devices from them if possible to exclude
skip_devices = set()
multigpu_models = model.get_additional_models_with_key("multigpu")
if len(multigpu_models) > 0:
for mm in multigpu_models:
skip_devices.add(mm.load_device)
skip_devices = list(skip_devices)
full_extra_devices = comfy.model_management.get_all_torch_devices(exclude_current=True)
limit_extra_devices = full_extra_devices[:max_gpus-1]
extra_devices = limit_extra_devices.copy()
# exclude skipped devices
for skip in skip_devices:
if skip in extra_devices:
extra_devices.remove(skip)
# create new deepclones
if len(extra_devices) > 0:
for device in extra_devices:
device_patcher = None
if reuse_loaded:
# check if there are any ModelPatchers currently loaded that could be referenced here after a clone
loaded_models: list[ModelPatcher] = comfy.model_management.loaded_models()
for lm in loaded_models:
if lm.model is not None and lm.clone_base_uuid == model.clone_base_uuid and lm.load_device == device:
device_patcher = lm.clone()
logging.info(f"Reusing loaded deepclone of {device_patcher.model.__class__.__name__} for {device}")
break
if device_patcher is None:
device_patcher = model.deepclone_multigpu(new_load_device=device)
device_patcher.is_multigpu_base_clone = True
multigpu_models = model.get_additional_models_with_key("multigpu")
multigpu_models.append(device_patcher)
model.set_additional_models("multigpu", multigpu_models)
model.match_multigpu_clones()
if gpu_options is None:
gpu_options = GPUOptionsGroup()
gpu_options.register(model)
else:
logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.")
# TODO: only keep model clones that don't go 'past' the intended max_gpu count
# multigpu_models = model.get_additional_models_with_key("multigpu")
# new_multigpu_models = []
# for m in multigpu_models:
# if m.load_device in limit_extra_devices:
# new_multigpu_models.append(m)
# model.set_additional_models("multigpu", new_multigpu_models)
# persist skip_devices for use in sampling code
# if len(skip_devices) > 0 or "multigpu_skip_devices" in model.model_options:
# model.model_options["multigpu_skip_devices"] = skip_devices
return model
LoadBalance = namedtuple('LoadBalance', ['work_per_device', 'idle_time'])
def load_balance_devices(model_options: dict[str], total_work: int, return_idle_time=False, work_normalized: int=None):
'Optimize work assigned to different devices, accounting for their relative speeds and splittable work.'
opts_dict = model_options['multigpu_options']
devices = list(model_options['multigpu_clones'].keys())
speed_per_device = []
work_per_device = []
# get sum of each device's relative_speed
total_speed = 0.0
for opts in opts_dict.values():
total_speed += opts['relative_speed']
# get relative work for each device;
# obtained by w = (W*r)/R
for device in devices:
relative_speed = opts_dict[device]['relative_speed']
relative_work = (total_work*relative_speed) / total_speed
speed_per_device.append(relative_speed)
work_per_device.append(relative_work)
# relative work must be expressed in whole numbers, but likely is a decimal;
# perform rounding while maintaining total sum equal to total work (sum of relative works)
work_per_device = round_preserved(work_per_device)
dict_work_per_device = {}
for device, relative_work in zip(devices, work_per_device):
dict_work_per_device[device] = relative_work
if not return_idle_time:
return LoadBalance(dict_work_per_device, None)
# divide relative work by relative speed to get estimated completion time of said work by each device;
# time here is relative and does not correspond to real-world units
completion_time = [w/r for w,r in zip(work_per_device, speed_per_device)]
# calculate relative time spent by the devices waiting on each other after their work is completed
idle_time = abs(min(completion_time) - max(completion_time))
# if need to compare work idle time, need to normalize to a common total work
if work_normalized:
idle_time *= (work_normalized/total_work)
return LoadBalance(dict_work_per_device, idle_time)
def round_preserved(values: list[float]):
'Round all values in a list, preserving the combined sum of values.'
# get floor of values; casting to int does it too
floored = [int(x) for x in values]
total_floored = sum(floored)
# get remainder to distribute
remainder = round(sum(values)) - total_floored
# pair values with fractional portions
fractional = [(i, x-floored[i]) for i, x in enumerate(values)]
# sort by fractional part in descending order
fractional.sort(key=lambda x: x[1], reverse=True)
# distribute the remainder
for i in range(remainder):
index = fractional[i][0]
floored[index] += 1
return floored
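Editor's note: load_balance_devices splits total_work in proportion to each device's relative_speed, w = (W*r)/R, and round_preserved then turns those fractions into whole work counts without changing the total: floor everything, then hand the leftover units to the entries with the largest fractional parts. A self-contained rerun of that arithmetic for two hypothetical GPUs with speeds 1.0 and 1.5:

def round_preserved(values):
    # Same idea as the helper above: round each value while preserving the combined sum.
    floored = [int(x) for x in values]
    remainder = round(sum(values)) - sum(floored)
    by_fraction = sorted(range(len(values)), key=lambda i: values[i] - floored[i], reverse=True)
    for i in by_fraction[:remainder]:
        floored[i] += 1
    return floored

total_work = 7                                  # e.g. 7 conds to run this step
relative_speed = [1.0, 1.5]                     # hypothetical speeds for two devices
total_speed = sum(relative_speed)
raw = [total_work * r / total_speed for r in relative_speed]   # w = (W*r)/R  -> [2.8, 4.2]
work = round_preserved(raw)                                    # -> [3, 4], still summing to 7
completion = [w / r for w, r in zip(work, relative_speed)]
idle = abs(min(completion) - max(completion))                  # relative idle time, as in LoadBalance
print(raw, work, round(idle, 3))

The faster device ends up with four of the seven batches and finishes slightly earlier; that gap is the idle_time the LoadBalance namedtuple reports.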

View File

@@ -17,12 +17,9 @@
""" """
import torch import torch
import logging
import comfy.model_management import comfy.model_management
from comfy.cli_args import args, PerformanceFeature from comfy.cli_args import args, PerformanceFeature
import comfy.float import comfy.float
import comfy.rmsnorm
import contextlib
cast_to = comfy.model_management.cast_to #TODO: remove once no more references cast_to = comfy.model_management.cast_to #TODO: remove once no more references
@@ -38,31 +35,20 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
if device is None: if device is None:
device = input.device device = input.device
offload_stream = comfy.model_management.get_offload_stream(device)
if offload_stream is not None:
wf_context = offload_stream
else:
wf_context = contextlib.nullcontext()
bias = None bias = None
non_blocking = comfy.model_management.device_supports_non_blocking(device) non_blocking = comfy.model_management.device_supports_non_blocking(device)
if s.bias is not None: if s.bias is not None:
has_function = len(s.bias_function) > 0 has_function = len(s.bias_function) > 0
bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function)
if has_function: if has_function:
with wf_context: for f in s.bias_function:
for f in s.bias_function: bias = f(bias)
bias = f(bias)
has_function = len(s.weight_function) > 0 has_function = len(s.weight_function) > 0
weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function)
if has_function: if has_function:
with wf_context: for f in s.weight_function:
for f in s.weight_function: weight = f(weight)
weight = f(weight)
comfy.model_management.sync_stream(device, offload_stream)
return weight, bias return weight, bias
class CastWeightBiasOp: class CastWeightBiasOp:
@@ -159,25 +145,6 @@ class disable_weight_init:
else: else:
return super().forward(*args, **kwargs) return super().forward(*args, **kwargs)
class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
def reset_parameters(self):
self.bias = None
return None
def forward_comfy_cast_weights(self, input):
if self.weight is not None:
weight, bias = cast_bias_weight(self, input)
else:
weight = None
return comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated
# return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
def forward(self, *args, **kwargs):
if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
return self.forward_comfy_cast_weights(*args, **kwargs)
else:
return super().forward(*args, **kwargs)
class ConvTranspose2d(torch.nn.ConvTranspose2d, CastWeightBiasOp): class ConvTranspose2d(torch.nn.ConvTranspose2d, CastWeightBiasOp):
def reset_parameters(self): def reset_parameters(self):
return None return None
@@ -275,9 +242,6 @@ class manual_cast(disable_weight_init):
class ConvTranspose1d(disable_weight_init.ConvTranspose1d): class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
comfy_cast_weights = True comfy_cast_weights = True
class RMSNorm(disable_weight_init.RMSNorm):
comfy_cast_weights = True
class Embedding(disable_weight_init.Embedding): class Embedding(disable_weight_init.Embedding):
comfy_cast_weights = True comfy_cast_weights = True
@@ -308,10 +272,10 @@ def fp8_linear(self, input):
if scale_input is None: if scale_input is None:
scale_input = torch.ones((), device=input.device, dtype=torch.float32) scale_input = torch.ones((), device=input.device, dtype=torch.float32)
input = torch.clamp(input, min=-448, max=448, out=input) input = torch.clamp(input, min=-448, max=448, out=input)
input = input.reshape(-1, input_shape[2]).to(dtype).contiguous() input = input.reshape(-1, input_shape[2]).to(dtype)
else: else:
scale_input = scale_input.to(input.device) scale_input = scale_input.to(input.device)
input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous() input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype)
if bias is not None: if bias is not None:
o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
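Editor's note: fp8_linear clamps activations to ±448 before casting because 448 is the largest finite value torch.float8_e4m3fn can hold; values outside that range would not survive the cast. A tiny check of that bound, assuming a PyTorch build that ships the float8 dtypes:

import torch

if hasattr(torch, "float8_e4m3fn"):
    x = torch.tensor([500.0, -500.0, 96.0])
    clamped = torch.clamp(x, min=-448, max=448)                # same bound as in the hunk above
    roundtrip = clamped.to(torch.float8_e4m3fn).to(torch.float32)
    print(roundtrip)                                           # 448 and 96 are exactly representable in e4m3fn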
@@ -344,7 +308,6 @@ class fp8_ops(manual_cast):
return torch.nn.functional.linear(input, weight, bias) return torch.nn.functional.linear(input, weight, bias)
def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None): def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
class scaled_fp8_op(manual_cast): class scaled_fp8_op(manual_cast):
class Linear(manual_cast.Linear): class Linear(manual_cast.Linear):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@@ -392,29 +355,10 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None
return scaled_fp8_op return scaled_fp8_op
CUBLAS_IS_AVAILABLE = False
try:
from cublas_ops import CublasLinear
CUBLAS_IS_AVAILABLE = True
except ImportError:
pass
if CUBLAS_IS_AVAILABLE:
class cublas_ops(disable_weight_init):
class Linear(CublasLinear, disable_weight_init.Linear):
def reset_parameters(self):
return None
def forward_comfy_cast_weights(self, input):
return super().forward(input)
def forward(self, *args, **kwargs):
return super().forward(*args, **kwargs)
def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None): def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
fp8_compute = comfy.model_management.supports_fp8_compute(load_device) fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
if scaled_fp8 is not None: if scaled_fp8 is not None:
return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8) return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)
if ( if (
fp8_compute and fp8_compute and
@@ -423,15 +367,6 @@ def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_
): ):
return fp8_ops return fp8_ops
if (
PerformanceFeature.CublasOps in args.fast and
CUBLAS_IS_AVAILABLE and
weight_dtype == torch.float16 and
(compute_dtype == torch.float16 or compute_dtype is None)
):
logging.info("Using cublas ops")
return cublas_ops
if compute_dtype is None or weight_dtype == compute_dtype: if compute_dtype is None or weight_dtype == compute_dtype:
return disable_weight_init return disable_weight_init
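Editor's note: the cublas_ops block in this hunk is only defined when the optional cublas_ops package imports cleanly, and pick_operations then consults the availability flag together with the weight/compute dtypes. A generic sketch of that optional-dependency gate; the package name comes from the diff, everything else (function name, string dtypes) is illustrative:

HAS_FAST_LINEAR = False
try:
    from cublas_ops import CublasLinear      # optional accelerator package
    HAS_FAST_LINEAR = True
except ImportError:
    CublasLinear = None                      # keep the name defined for later references

def pick_linear_backend(weight_dtype, compute_dtype):
    # Use the optional backend only when it is installed *and* the dtypes suit it.
    if HAS_FAST_LINEAR and weight_dtype == "float16" and compute_dtype in ("float16", None):
        return "cublas"
    return "default"

print(pick_linear_backend("float16", None))  # "cublas" if the package is installed, else "default"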

View File

@@ -3,8 +3,6 @@ from typing import Callable
class CallbacksMP: class CallbacksMP:
ON_CLONE = "on_clone" ON_CLONE = "on_clone"
ON_DEEPCLONE_MULTIGPU = "on_deepclone_multigpu"
ON_MATCH_MULTIGPU_CLONES = "on_match_multigpu_clones"
ON_LOAD = "on_load_after" ON_LOAD = "on_load_after"
ON_DETACH = "on_detach_after" ON_DETACH = "on_detach_after"
ON_CLEANUP = "on_cleanup" ON_CLEANUP = "on_cleanup"
@@ -50,7 +48,6 @@ def get_all_callbacks(call_type: str, transformer_options: dict, is_model_option
class WrappersMP: class WrappersMP:
OUTER_SAMPLE = "outer_sample" OUTER_SAMPLE = "outer_sample"
PREPARE_SAMPLING = "prepare_sampling"
SAMPLER_SAMPLE = "sampler_sample" SAMPLER_SAMPLE = "sampler_sample"
CALC_COND_BATCH = "calc_cond_batch" CALC_COND_BATCH = "calc_cond_batch"
APPLY_MODEL = "apply_model" APPLY_MODEL = "apply_model"

View File

@@ -1,55 +0,0 @@
import torch
import comfy.model_management
import numbers
RMSNorm = None
try:
rms_norm_torch = torch.nn.functional.rms_norm
RMSNorm = torch.nn.RMSNorm
except:
rms_norm_torch = None
def rms_norm(x, weight=None, eps=1e-6):
if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
if weight is None:
return rms_norm_torch(x, (x.shape[-1],), eps=eps)
else:
return rms_norm_torch(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
else:
r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
if weight is None:
return r
else:
return r * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
if RMSNorm is None:
class RMSNorm(torch.nn.Module):
def __init__(
self,
normalized_shape,
eps=1e-6,
elementwise_affine=True,
device=None,
dtype=None,
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
if isinstance(normalized_shape, numbers.Integral):
# mypy error: incompatible types in assignment
normalized_shape = (normalized_shape,) # type: ignore[assignment]
self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
self.eps = eps
self.elementwise_affine = elementwise_affine
if self.elementwise_affine:
self.weight = torch.nn.Parameter(
torch.empty(self.normalized_shape, **factory_kwargs)
)
else:
self.register_parameter("weight", None)
self.bias = None
def forward(self, x):
return rms_norm(x, self.weight, self.eps)
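Editor's note: the comfy/rmsnorm.py contents shown here normalize by x * rsqrt(mean(x**2, last dim) + eps), optionally scaled by a learned weight, and only reach for torch.nn.functional.rms_norm when the installed PyTorch provides it. A quick stand-alone check that the manual formula matches the built-in where it exists:

import torch

def rms_norm_ref(x, weight=None, eps=1e-6):
    # Same math as the fallback branch above.
    r = x * torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + eps)
    return r if weight is None else r * weight

x = torch.randn(2, 8, dtype=torch.float64)
w = torch.randn(8, dtype=torch.float64)
out = rms_norm_ref(x, w)

if hasattr(torch.nn.functional, "rms_norm"):                # newer PyTorch only
    ref = torch.nn.functional.rms_norm(x, (8,), weight=w, eps=1e-6)
    print(bool(torch.allclose(out, ref, atol=1e-9)))         # the two paths agree
else:
    print(out.shape)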

View File

@@ -1,11 +1,7 @@
from __future__ import annotations from __future__ import annotations
import torch
import uuid import uuid
import math
import collections
import comfy.model_management import comfy.model_management
import comfy.conds import comfy.conds
import comfy.model_patcher
import comfy.utils import comfy.utils
import comfy.hooks import comfy.hooks
import comfy.patcher_extension import comfy.patcher_extension
@@ -108,78 +104,15 @@ def cleanup_additional_models(models):
if hasattr(m, 'cleanup'): if hasattr(m, 'cleanup'):
m.cleanup() m.cleanup()
def preprocess_multigpu_conds(conds: dict[str, list[dict[str]]], model: ModelPatcher, model_options: dict[str]):
'''If multigpu acceleration required, creates deepclones of ControlNets and GLIGEN per device.'''
multigpu_models: list[ModelPatcher] = model.get_additional_models_with_key("multigpu")
if len(multigpu_models) == 0:
return
extra_devices = [x.load_device for x in multigpu_models]
# handle controlnets
controlnets: set[ControlBase] = set()
for k in conds:
for kk in conds[k]:
if 'control' in kk:
controlnets.add(kk['control'])
if len(controlnets) > 0:
# first, unload all controlnet clones
for cnet in list(controlnets):
cnet_models = cnet.get_models()
for cm in cnet_models:
comfy.model_management.unload_model_and_clones(cm, unload_additional_models=True)
# next, make sure each controlnet has a deepclone for all relevant devices
for cnet in controlnets:
curr_cnet = cnet
while curr_cnet is not None:
for device in extra_devices:
if device not in curr_cnet.multigpu_clones:
curr_cnet.deepclone_multigpu(device, autoregister=True)
curr_cnet = curr_cnet.previous_controlnet
# since all device clones are now present, recreate the linked list for cloned cnets per device
for cnet in controlnets:
curr_cnet = cnet
while curr_cnet is not None:
prev_cnet = curr_cnet.previous_controlnet
for device in extra_devices:
device_cnet = curr_cnet.get_instance_for_device(device)
prev_device_cnet = None
if prev_cnet is not None:
prev_device_cnet = prev_cnet.get_instance_for_device(device)
device_cnet.set_previous_controlnet(prev_device_cnet)
curr_cnet = prev_cnet
# potentially handle gligen - since not widely used, ignored for now
def estimate_memory(model, noise_shape, conds):
cond_shapes = collections.defaultdict(list)
cond_shapes_min = {}
for _, cs in conds.items():
for cond in cs:
for k, v in model.model.extra_conds_shapes(**cond).items():
cond_shapes[k].append(v)
if cond_shapes_min.get(k, None) is None:
cond_shapes_min[k] = [v]
elif math.prod(v) > math.prod(cond_shapes_min[k][0]):
cond_shapes_min[k] = [v]
memory_required = model.model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:]), cond_shapes=cond_shapes)
minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min)
return memory_required, minimum_memory_required
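Editor's note: estimate_memory collects every cond tensor shape per key into a defaultdict(list) for the full-batch estimate and, separately, keeps only the largest shape per key (by math.prod of its dimensions) for the minimum-memory estimate. A small stand-alone sketch of that bookkeeping with made-up shapes:

import math
import collections

cond_shapes = collections.defaultdict(list)
cond_shapes_min = {}

# Made-up per-cond shapes; in the real code these come from extra_conds_shapes().
for shapes in [{"c_crossattn": (1, 77, 768)}, {"c_crossattn": (1, 154, 768)}]:
    for k, v in shapes.items():
        cond_shapes[k].append(v)
        # Keep only the largest shape per key for the minimum-case estimate.
        if k not in cond_shapes_min or math.prod(v) > math.prod(cond_shapes_min[k][0]):
            cond_shapes_min[k] = [v]

print(dict(cond_shapes))      # every shape, used for the full estimate
print(cond_shapes_min)        # only the biggest one per key, used for the minimum estimate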
def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None): def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
executor = comfy.patcher_extension.WrapperExecutor.new_executor( real_model: BaseModel = None
_prepare_sampling,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
)
return executor.execute(model, noise_shape, conds, model_options=model_options)
def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
model.match_multigpu_clones()
preprocess_multigpu_conds(conds, model, model_options)
models, inference_memory = get_additional_models(conds, model.model_dtype()) models, inference_memory = get_additional_models(conds, model.model_dtype())
models += get_additional_models_from_model_options(model_options) models += get_additional_models_from_model_options(model_options)
models += model.get_nested_additional_models() # TODO: does this require inference_memory update? models += model.get_nested_additional_models() # TODO: does this require inference_memory update?
memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds) memory_required = model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory
comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory) minimum_memory_required = model.memory_required([noise_shape[0]] + list(noise_shape[1:])) + inference_memory
comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required)
real_model = model.model real_model = model.model
return real_model, conds, models return real_model, conds, models
@@ -193,7 +126,7 @@ def cleanup_models(conds, models):
cleanup_additional_models(set(control_cleanup)) cleanup_additional_models(set(control_cleanup))
def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict): def prepare_model_patcher(model: 'ModelPatcher', conds, model_options: dict):
''' '''
Registers hooks from conds. Registers hooks from conds.
''' '''
@@ -226,18 +159,3 @@ def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict):
comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name], comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name],
copy_dict1=False) copy_dict1=False)
return to_load_options return to_load_options
def prepare_model_patcher_multigpu_clones(model_patcher: ModelPatcher, loaded_models: list[ModelPatcher], model_options: dict):
'''
In case multigpu acceleration is enabled, prep ModelPatchers for each device.
'''
multigpu_patchers: list[ModelPatcher] = [x for x in loaded_models if x.is_multigpu_base_clone]
if len(multigpu_patchers) > 0:
multigpu_dict: dict[torch.device, ModelPatcher] = {}
multigpu_dict[model_patcher.load_device] = model_patcher
for x in multigpu_patchers:
x.hook_patches = comfy.model_patcher.create_hook_patches_clone(model_patcher.hook_patches, copy_tuples=True)
x.hook_mode = model_patcher.hook_mode # match main model's hook_mode
multigpu_dict[x.load_device] = x
model_options["multigpu_clones"] = multigpu_dict
return multigpu_patchers

View File

@@ -1,6 +1,4 @@
from __future__ import annotations from __future__ import annotations
import comfy.model_management
from .k_diffusion import sampling as k_diffusion_sampling from .k_diffusion import sampling as k_diffusion_sampling
from .extra_samplers import uni_pc from .extra_samplers import uni_pc
from typing import TYPE_CHECKING, Callable, NamedTuple from typing import TYPE_CHECKING, Callable, NamedTuple
@@ -20,13 +18,6 @@ import comfy.patcher_extension
import comfy.hooks import comfy.hooks
import scipy.stats import scipy.stats
import numpy import numpy
import threading
def add_area_dims(area, num_dims):
while (len(area) // 2) < num_dims:
area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
return area
def get_area_and_mult(conds, x_in, timestep_in): def get_area_and_mult(conds, x_in, timestep_in):
dims = tuple(x_in.shape[2:]) dims = tuple(x_in.shape[2:])
@@ -43,9 +34,8 @@ def get_area_and_mult(conds, x_in, timestep_in):
return None return None
if 'area' in conds: if 'area' in conds:
area = list(conds['area']) area = list(conds['area'])
area = add_area_dims(area, len(dims)) while (len(area) // 2) < len(dims):
if (len(area) // 2) > len(dims): area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
area = area[:len(dims)] + area[len(area) // 2:(len(area) // 2) + len(dims)]
if 'strength' in conds: if 'strength' in conds:
strength = conds['strength'] strength = conds['strength']
@@ -63,7 +53,7 @@ def get_area_and_mult(conds, x_in, timestep_in):
if "mask_strength" in conds: if "mask_strength" in conds:
mask_strength = conds["mask_strength"] mask_strength = conds["mask_strength"]
mask = conds['mask'] mask = conds['mask']
assert (mask.shape[1:] == x_in.shape[2:]) assert(mask.shape[1:] == x_in.shape[2:])
mask = mask[:input_x.shape[0]] mask = mask[:input_x.shape[0]]
if area is not None: if area is not None:
@@ -77,17 +67,16 @@ def get_area_and_mult(conds, x_in, timestep_in):
mult = mask * strength mult = mask * strength
if 'mask' not in conds and area is not None: if 'mask' not in conds and area is not None:
fuzz = 8 rr = 8
for i in range(len(dims)): for i in range(len(dims)):
rr = min(fuzz, mult.shape[2 + i] // 4)
if area[len(dims) + i] != 0: if area[len(dims) + i] != 0:
for t in range(rr): for t in range(rr):
m = mult.narrow(i + 2, t, 1) m = mult.narrow(i + 2, t, 1)
m *= ((1.0 / rr) * (t + 1)) m *= ((1.0/rr) * (t + 1))
if (area[i] + area[len(dims) + i]) < x_in.shape[i + 2]: if (area[i] + area[len(dims) + i]) < x_in.shape[i + 2]:
for t in range(rr): for t in range(rr):
m = mult.narrow(i + 2, area[i] - 1 - t, 1) m = mult.narrow(i + 2, area[i] - 1 - t, 1)
m *= ((1.0 / rr) * (t + 1)) m *= ((1.0/rr) * (t + 1))
conditioning = {} conditioning = {}
model_conds = conds["model_conds"] model_conds = conds["model_conds"]
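Editor's note: when a cond has an area but no mask, get_area_and_mult feathers the area's edges: along each dimension the first and last rr positions are scaled by (1/rr)*(t+1), a linear ramp from 1/rr up to 1 (one side of this hunk additionally caps rr at a quarter of the dimension via min(8, size // 4)). A tiny numeric illustration of the ramp on a single dimension; the tensor below is a stand-in, not a real mult mask:

import torch

rr = 8
mult = torch.ones(1, 1, 32)                    # one spatial dimension of an area, all ones to start
for t in range(rr):                            # leading edge
    mult.narrow(2, t, 1).mul_((1.0 / rr) * (t + 1))
for t in range(rr):                            # trailing edge
    mult.narrow(2, mult.shape[2] - 1 - t, 1).mul_((1.0 / rr) * (t + 1))
print(mult[0, 0, :8])                          # 0.125, 0.250, ..., 1.000
print(mult[0, 0, -8:])                         # 1.000, ..., 0.250, 0.125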
@@ -143,7 +132,7 @@ def can_concat_cond(c1, c2):
return cond_equal_size(c1.conditioning, c2.conditioning) return cond_equal_size(c1.conditioning, c2.conditioning)
def cond_cat(c_list, device=None): def cond_cat(c_list):
temp = {} temp = {}
for x in c_list: for x in c_list:
for k in x: for k in x:
@@ -155,8 +144,6 @@ def cond_cat(c_list, device=None):
for k in temp: for k in temp:
conds = temp[k] conds = temp[k]
out[k] = conds[0].concat(conds[1:]) out[k] = conds[0].concat(conds[1:])
if device is not None and hasattr(out[k], 'to'):
out[k] = out[k].to(device)
return out return out
@@ -210,9 +197,7 @@ def calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Ten
) )
return executor.execute(model, conds, x_in, timestep, model_options) return executor.execute(model, conds, x_in, timestep, model_options)
def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options):
if 'multigpu_clones' in model_options:
return _calc_cond_batch_multigpu(model, conds, x_in, timestep, model_options)
out_conds = [] out_conds = []
out_counts = [] out_counts = []
# separate conds by matching hooks # separate conds by matching hooks
@@ -244,7 +229,7 @@ def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Te
if has_default_conds: if has_default_conds:
finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options) finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
model.current_patcher.prepare_state(timestep, model_options) model.current_patcher.prepare_state(timestep)
# run every hooked_to_run separately # run every hooked_to_run separately
for hooks, to_run in hooked_to_run.items(): for hooks, to_run in hooked_to_run.items():
@@ -263,13 +248,7 @@ def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Te
for i in range(1, len(to_batch_temp) + 1): for i in range(1, len(to_batch_temp) + 1):
batch_amount = to_batch_temp[:len(to_batch_temp)//i] batch_amount = to_batch_temp[:len(to_batch_temp)//i]
input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:] input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
cond_shapes = collections.defaultdict(list) if model.memory_required(input_shape) * 1.5 < free_memory:
for tt in batch_amount:
cond = {k: v.size() for k, v in to_run[tt][0].conditioning.items()}
for k, v in to_run[tt][0].conditioning.items():
cond_shapes[k].append(v.size())
if model.memory_required(input_shape, cond_shapes=cond_shapes) * 1.5 < free_memory:
to_batch = batch_amount to_batch = batch_amount
break break
@@ -352,190 +331,6 @@ def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Te
return out_conds return out_conds
def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
out_conds = []
out_counts = []
# separate conds by matching hooks
hooked_to_run: dict[comfy.hooks.HookGroup,list[tuple[tuple,int]]] = {}
default_conds = []
has_default_conds = False
output_device = x_in.device
for i in range(len(conds)):
out_conds.append(torch.zeros_like(x_in))
out_counts.append(torch.ones_like(x_in) * 1e-37)
cond = conds[i]
default_c = []
if cond is not None:
for x in cond:
if 'default' in x:
default_c.append(x)
has_default_conds = True
continue
p = get_area_and_mult(x, x_in, timestep)
if p is None:
continue
if p.hooks is not None:
model.current_patcher.prepare_hook_patches_current_keyframe(timestep, p.hooks, model_options)
hooked_to_run.setdefault(p.hooks, list())
hooked_to_run[p.hooks] += [(p, i)]
default_conds.append(default_c)
if has_default_conds:
finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
model.current_patcher.prepare_state(timestep, model_options)
devices = [dev_m for dev_m in model_options['multigpu_clones'].keys()]
device_batched_hooked_to_run: dict[torch.device, list[tuple[comfy.hooks.HookGroup, tuple]]] = {}
total_conds = 0
for to_run in hooked_to_run.values():
total_conds += len(to_run)
conds_per_device = max(1, math.ceil(total_conds//len(devices)))
index_device = 0
current_device = devices[index_device]
# run every hooked_to_run separately
for hooks, to_run in hooked_to_run.items():
while len(to_run) > 0:
current_device = devices[index_device % len(devices)]
batched_to_run = device_batched_hooked_to_run.setdefault(current_device, [])
# keep track of conds currently scheduled onto this device
batched_to_run_length = 0
for btr in batched_to_run:
batched_to_run_length += len(btr[1])
first = to_run[0]
first_shape = first[0][0].shape
to_batch_temp = []
# make sure not over conds_per_device limit when creating temp batch
for x in range(len(to_run)):
if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < (conds_per_device - batched_to_run_length):
to_batch_temp += [x]
to_batch_temp.reverse()
to_batch = to_batch_temp[:1]
free_memory = model_management.get_free_memory(current_device)
for i in range(1, len(to_batch_temp) + 1):
batch_amount = to_batch_temp[:len(to_batch_temp)//i]
input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
if model.memory_required(input_shape) * 1.5 < free_memory:
to_batch = batch_amount
break
conds_to_batch = []
for x in to_batch:
conds_to_batch.append(to_run.pop(x))
batched_to_run_length += len(conds_to_batch)
batched_to_run.append((hooks, conds_to_batch))
if batched_to_run_length >= conds_per_device:
index_device += 1
thread_result = collections.namedtuple('thread_result', ['output', 'mult', 'area', 'batch_chunks', 'cond_or_uncond'])
def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: list[thread_result]):
model_current: BaseModel = model_options["multigpu_clones"][device].model
# run every hooked_to_run separately
with torch.no_grad():
for hooks, to_batch in batch_tuple:
input_x = []
mult = []
c = []
cond_or_uncond = []
uuids = []
area = []
control: ControlBase = None
patches = None
for x in to_batch:
o = x
p = o[0]
input_x.append(p.input_x)
mult.append(p.mult)
c.append(p.conditioning)
area.append(p.area)
cond_or_uncond.append(o[1])
uuids.append(p.uuid)
control = p.control
patches = p.patches
batch_chunks = len(cond_or_uncond)
input_x = torch.cat(input_x).to(device)
c = cond_cat(c, device=device)
timestep_ = torch.cat([timestep.to(device)] * batch_chunks)
transformer_options = model_current.current_patcher.apply_hooks(hooks=hooks)
if 'transformer_options' in model_options:
transformer_options = comfy.patcher_extension.merge_nested_dicts(transformer_options,
model_options['transformer_options'],
copy_dict1=False)
if patches is not None:
# TODO: replace with merge_nested_dicts function
if "patches" in transformer_options:
cur_patches = transformer_options["patches"].copy()
for p in patches:
if p in cur_patches:
cur_patches[p] = cur_patches[p] + patches[p]
else:
cur_patches[p] = patches[p]
transformer_options["patches"] = cur_patches
else:
transformer_options["patches"] = patches
transformer_options["cond_or_uncond"] = cond_or_uncond[:]
transformer_options["uuids"] = uuids[:]
transformer_options["sigmas"] = timestep
transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)
transformer_options["multigpu_thread_device"] = device
cast_transformer_options(transformer_options, device=device)
c['transformer_options'] = transformer_options
if control is not None:
device_control = control.get_instance_for_device(device)
c['control'] = device_control.get_control(input_x, timestep_, c, len(cond_or_uncond), transformer_options)
if 'model_function_wrapper' in model_options:
output = model_options['model_function_wrapper'](model_current.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).to(output_device).chunk(batch_chunks)
else:
output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks)
results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond))
results: list[thread_result] = []
threads: list[threading.Thread] = []
for device, batch_tuple in device_batched_hooked_to_run.items():
new_thread = threading.Thread(target=_handle_batch, args=(device, batch_tuple, results))
threads.append(new_thread)
new_thread.start()
for thread in threads:
thread.join()
for output, mult, area, batch_chunks, cond_or_uncond in results:
for o in range(batch_chunks):
cond_index = cond_or_uncond[o]
a = area[o]
if a is None:
out_conds[cond_index] += output[o] * mult[o]
out_counts[cond_index] += mult[o]
else:
out_c = out_conds[cond_index]
out_cts = out_counts[cond_index]
dims = len(a) // 2
for i in range(dims):
out_c = out_c.narrow(i + 2, a[i + dims], a[i])
out_cts = out_cts.narrow(i + 2, a[i + dims], a[i])
out_c += output[o] * mult[o]
out_cts += mult[o]
for i in range(len(out_conds)):
out_conds[i] /= out_counts[i]
return out_conds
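Editor's note: _calc_cond_batch_multigpu spreads the hooked cond batches across devices (roughly total_conds / len(devices) each), runs one Python thread per device, gathers thread_result namedtuples, and then merges the outputs back on the original device. A minimal, device-free sketch of that fan-out/fan-in shape; the work function is a placeholder, not model inference:

import threading
import collections

ThreadResult = collections.namedtuple("ThreadResult", ["device", "outputs"])

def handle_batch(device, batch, results):
    results.append(ThreadResult(device, [x * 2 for x in batch]))   # stand-in for running conds on a device

work = list(range(7))
devices = ["cuda:0", "cuda:1"]
per_device = max(1, -(-len(work) // len(devices)))                 # ceil division
batches = {d: work[i * per_device:(i + 1) * per_device] for i, d in enumerate(devices)}

results, threads = [], []
for device, batch in batches.items():
    t = threading.Thread(target=handle_batch, args=(device, batch, results))
    threads.append(t)
    t.start()
for t in threads:
    t.join()                                                        # fan-in: wait for every device

for r in sorted(results):                                           # merge in a deterministic order
    print(r.device, r.outputs)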
def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove
logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.") logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.")
return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options)) return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options))
@@ -756,37 +551,25 @@ def resolve_areas_and_cond_masks(conditions, h, w, device):
logging.warning("WARNING: The comfy.samplers.resolve_areas_and_cond_masks function is deprecated please use the resolve_areas_and_cond_masks_multidim one instead.") logging.warning("WARNING: The comfy.samplers.resolve_areas_and_cond_masks function is deprecated please use the resolve_areas_and_cond_masks_multidim one instead.")
return resolve_areas_and_cond_masks_multidim(conditions, [h, w], device) return resolve_areas_and_cond_masks_multidim(conditions, [h, w], device)
def create_cond_with_same_area_if_none(conds, c): def create_cond_with_same_area_if_none(conds, c): #TODO: handle dim != 2
if 'area' not in c: if 'area' not in c:
return return
def area_inside(a, area_cmp):
a = add_area_dims(a, len(area_cmp) // 2)
area_cmp = add_area_dims(area_cmp, len(a) // 2)
a_l = len(a) // 2
area_cmp_l = len(area_cmp) // 2
for i in range(min(a_l, area_cmp_l)):
if a[a_l + i] < area_cmp[area_cmp_l + i]:
return False
for i in range(min(a_l, area_cmp_l)):
if (a[i] + a[a_l + i]) > (area_cmp[i] + area_cmp[area_cmp_l + i]):
return False
return True
c_area = c['area'] c_area = c['area']
smallest = None smallest = None
for x in conds: for x in conds:
if 'area' in x: if 'area' in x:
a = x['area'] a = x['area']
if area_inside(c_area, a): if c_area[2] >= a[2] and c_area[3] >= a[3]:
if smallest is None: if a[0] + a[2] >= c_area[0] + c_area[2]:
smallest = x if a[1] + a[3] >= c_area[1] + c_area[3]:
elif 'area' not in smallest: if smallest is None:
smallest = x smallest = x
else: elif 'area' not in smallest:
if math.prod(smallest['area'][:len(smallest['area']) // 2]) > math.prod(a[:len(a) // 2]): smallest = x
smallest = x else:
if smallest['area'][0] * smallest['area'][1] > a[0] * a[1]:
smallest = x
else: else:
if smallest is None: if smallest is None:
smallest = x smallest = x
@@ -833,8 +616,6 @@ def pre_run_control(model, conds):
percent_to_timestep_function = lambda a: s.percent_to_sigma(a) percent_to_timestep_function = lambda a: s.percent_to_sigma(a)
if 'control' in x: if 'control' in x:
x['control'].pre_run(model, percent_to_timestep_function) x['control'].pre_run(model, percent_to_timestep_function)
for device_cnet in x['control'].multigpu_clones.values():
device_cnet.pre_run(model, percent_to_timestep_function)
def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func): def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func):
cond_cnets = [] cond_cnets = []
@@ -909,7 +690,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c
"lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu", "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
"dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
"ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
"gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"] "gradient_estimation"]
class KSAMPLER(Sampler): class KSAMPLER(Sampler):
def __init__(self, sampler_function, extra_options={}, inpaint_options={}): def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
@@ -1077,9 +858,7 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
to_load_options = model_options.get("to_load_options", None) to_load_options = model_options.get("to_load_options", None)
if to_load_options is None: if to_load_options is None:
return return
cast_transformer_options(to_load_options, device, dtype)
def cast_transformer_options(transformer_options: dict[str], device=None, dtype=None):
casts = [] casts = []
if device is not None: if device is not None:
casts.append(device) casts.append(device)
@@ -1088,17 +867,18 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
# if nothing to apply, do nothing # if nothing to apply, do nothing
if len(casts) == 0: if len(casts) == 0:
return return
# try to call .to on patches # try to call .to on patches
if "patches" in transformer_options: if "patches" in to_load_options:
patches = transformer_options["patches"] patches = to_load_options["patches"]
for name in patches: for name in patches:
patch_list = patches[name] patch_list = patches[name]
for i in range(len(patch_list)): for i in range(len(patch_list)):
if hasattr(patch_list[i], "to"): if hasattr(patch_list[i], "to"):
for cast in casts: for cast in casts:
patch_list[i] = patch_list[i].to(cast) patch_list[i] = patch_list[i].to(cast)
if "patches_replace" in transformer_options: if "patches_replace" in to_load_options:
patches = transformer_options["patches_replace"] patches = to_load_options["patches_replace"]
for name in patches: for name in patches:
patch_list = patches[name] patch_list = patches[name]
for k in patch_list: for k in patch_list:
@@ -1108,8 +888,8 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
# try to call .to on any wrappers/callbacks # try to call .to on any wrappers/callbacks
wrappers_and_callbacks = ["wrappers", "callbacks"] wrappers_and_callbacks = ["wrappers", "callbacks"]
for wc_name in wrappers_and_callbacks: for wc_name in wrappers_and_callbacks:
if wc_name in transformer_options: if wc_name in to_load_options:
wc: dict[str, list] = transformer_options[wc_name] wc: dict[str, list] = to_load_options[wc_name]
for wc_dict in wc.values(): for wc_dict in wc.values():
for wc_list in wc_dict.values(): for wc_list in wc_dict.values():
for i in range(len(wc_list)): for i in range(len(wc_list)):
@@ -1117,6 +897,7 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
for cast in casts: for cast in casts:
wc_list[i] = wc_list[i].to(cast) wc_list[i] = wc_list[i].to(cast)
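cast_to_load_options above now delegates to cast_transformer_options, which walks the patches, patches_replace, wrappers and callbacks entries and calls .to() on anything that supports it. A simplified sketch of that walk for a flat dict of lists (the real helper also descends into the nested patches_replace and wrapper/callback dictionaries):

import torch

def cast_option_lists(options, device=None, dtype=None):
    # Move every entry that exposes .to() onto the requested device/dtype.
    casts = [c for c in (device, dtype) if c is not None]
    if not casts:
        return
    for entries in options.values():
        for i in range(len(entries)):
            if hasattr(entries[i], "to"):
                for cast in casts:
                    entries[i] = entries[i].to(cast)

opts = {"patches": [torch.zeros(2)], "callbacks": []}
cast_option_lists(opts, dtype=torch.float16)
print(opts["patches"][0].dtype)   # torch.float16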
class CFGGuider: class CFGGuider:
def __init__(self, model_patcher: ModelPatcher): def __init__(self, model_patcher: ModelPatcher):
self.model_patcher = model_patcher self.model_patcher = model_patcher
@@ -1162,8 +943,6 @@ class CFGGuider:
self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options) self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
device = self.model_patcher.load_device device = self.model_patcher.load_device
multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options)
if denoise_mask is not None: if denoise_mask is not None:
denoise_mask = comfy.sampler_helpers.prepare_mask(denoise_mask, noise.shape, device) denoise_mask = comfy.sampler_helpers.prepare_mask(denoise_mask, noise.shape, device)
@@ -1174,13 +953,9 @@ class CFGGuider:
try: try:
self.model_patcher.pre_run() self.model_patcher.pre_run()
for multigpu_patcher in multigpu_patchers:
multigpu_patcher.pre_run()
output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)
finally: finally:
self.model_patcher.cleanup() self.model_patcher.cleanup()
for multigpu_patcher in multigpu_patchers:
multigpu_patcher.cleanup()
comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models) comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models)
del self.inner_model del self.inner_model

View File

@@ -1,5 +1,4 @@
from __future__ import annotations from __future__ import annotations
import json
import torch import torch
from enum import Enum from enum import Enum
import logging import logging
@@ -14,8 +13,6 @@ import comfy.ldm.genmo.vae.model
import comfy.ldm.lightricks.vae.causal_video_autoencoder import comfy.ldm.lightricks.vae.causal_video_autoencoder
import comfy.ldm.cosmos.vae import comfy.ldm.cosmos.vae
import comfy.ldm.wan.vae import comfy.ldm.wan.vae
import comfy.ldm.hunyuan3d.vae
import comfy.ldm.ace.vae.music_dcae_pipeline
import yaml import yaml
import math import math
@@ -42,8 +39,6 @@ import comfy.text_encoders.hunyuan_video
import comfy.text_encoders.cosmos import comfy.text_encoders.cosmos
import comfy.text_encoders.lumina2 import comfy.text_encoders.lumina2
import comfy.text_encoders.wan import comfy.text_encoders.wan
import comfy.text_encoders.hidream
import comfy.text_encoders.ace
import comfy.model_patcher import comfy.model_patcher
import comfy.lora import comfy.lora
@@ -122,7 +117,6 @@ class CLIP:
self.layer_idx = None self.layer_idx = None
self.use_clip_schedule = False self.use_clip_schedule = False
logging.info("CLIP/text encoder model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype)) logging.info("CLIP/text encoder model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype))
self.tokenizer_options = {}
def clone(self): def clone(self):
n = CLIP(no_init=True) n = CLIP(no_init=True)
@@ -130,7 +124,6 @@ class CLIP:
n.cond_stage_model = self.cond_stage_model n.cond_stage_model = self.cond_stage_model
n.tokenizer = self.tokenizer n.tokenizer = self.tokenizer
n.layer_idx = self.layer_idx n.layer_idx = self.layer_idx
n.tokenizer_options = self.tokenizer_options.copy()
n.use_clip_schedule = self.use_clip_schedule n.use_clip_schedule = self.use_clip_schedule
n.apply_hooks_to_conds = self.apply_hooks_to_conds n.apply_hooks_to_conds = self.apply_hooks_to_conds
return n return n
@@ -138,19 +131,11 @@ class CLIP:
def add_patches(self, patches, strength_patch=1.0, strength_model=1.0): def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
return self.patcher.add_patches(patches, strength_patch, strength_model) return self.patcher.add_patches(patches, strength_patch, strength_model)
def set_tokenizer_option(self, option_name, value):
self.tokenizer_options[option_name] = value
def clip_layer(self, layer_idx): def clip_layer(self, layer_idx):
self.layer_idx = layer_idx self.layer_idx = layer_idx
def tokenize(self, text, return_word_ids=False, **kwargs): def tokenize(self, text, return_word_ids=False):
tokenizer_options = kwargs.get("tokenizer_options", {}) return self.tokenizer.tokenize_with_weights(text, return_word_ids)
if len(self.tokenizer_options) > 0:
tokenizer_options = {**self.tokenizer_options, **tokenizer_options}
if len(tokenizer_options) > 0:
kwargs["tokenizer_options"] = tokenizer_options
return self.tokenizer.tokenize_with_weights(text, return_word_ids, **kwargs)
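With the change above, tokenizer options can be set on the CLIP instance via set_tokenizer_option and overridden per call; the merged dict is forwarded to tokenize_with_weights, where each tokenizer reads keys such as clip_l_min_length. A small sketch of that merge-and-lookup pattern; the function name and defaults here are illustrative, not part of the comfy API:

def resolve_tokenizer_setting(embedding_key, setting, default, instance_options, per_call_options):
    # Per-call options win over instance-level defaults, then each tokenizer
    # reads its own "<embedding_key>_<setting>" entry.
    merged = {**instance_options, **per_call_options}
    return merged.get("{}_{}".format(embedding_key, setting), default)

defaults = {"clip_l_min_length": 77}
print(resolve_tokenizer_setting("clip_l", "min_length", None, defaults, {}))                           # 77
print(resolve_tokenizer_setting("clip_l", "min_length", None, defaults, {"clip_l_min_length": 128}))   # 128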
def add_hooks_to_dict(self, pooled_dict: dict[str]): def add_hooks_to_dict(self, pooled_dict: dict[str]):
if self.apply_hooks_to_conds: if self.apply_hooks_to_conds:
@@ -264,7 +249,7 @@ class CLIP:
return self.patcher.get_key_patches() return self.patcher.get_key_patches()
class VAE: class VAE:
def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None): def __init__(self, sd=None, device=None, config=None, dtype=None):
if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
sd = diffusers_convert.convert_vae_state_dict(sd) sd = diffusers_convert.convert_vae_state_dict(sd)
@@ -278,11 +263,9 @@ class VAE:
self.process_input = lambda image: image * 2.0 - 1.0 self.process_input = lambda image: image * 2.0 - 1.0
self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0) self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
self.working_dtypes = [torch.bfloat16, torch.float32] self.working_dtypes = [torch.bfloat16, torch.float32]
self.disable_offload = False
self.downscale_index_formula = None self.downscale_index_formula = None
self.upscale_index_formula = None self.upscale_index_formula = None
self.extra_1d_channel = None
if config is None: if config is None:
if "decoder.mid.block_1.mix_factor" in sd: if "decoder.mid.block_1.mix_factor" in sd:
@@ -352,7 +335,6 @@ class VAE:
self.process_output = lambda audio: audio self.process_output = lambda audio: audio
self.process_input = lambda audio: audio self.process_input = lambda audio: audio
self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
self.disable_offload = True
elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae
if "blocks.2.blocks.3.stack.5.weight" in sd: if "blocks.2.blocks.3.stack.5.weight" in sd:
sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."}) sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."})
@@ -375,12 +357,7 @@ class VAE:
version = 0 version = 0
elif tensor_conv1.shape[0] == 1024: elif tensor_conv1.shape[0] == 1024:
version = 1 version = 1
if "encoder.down_blocks.1.conv.conv.bias" in sd: self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version)
version = 2
vae_config = None
if metadata is not None and "config" in metadata:
vae_config = json.loads(metadata["config"]).get("vae", None)
self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version, config=vae_config)
self.latent_channels = 128 self.latent_channels = 128
self.latent_dim = 3 self.latent_dim = 3
self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
@@ -429,31 +406,6 @@ class VAE:
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype)
self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype)
elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd:
self.latent_dim = 1
ln_post = "geo_decoder.ln_post.weight" in sd
inner_size = sd["geo_decoder.output_proj.weight"].shape[1]
downsample_ratio = sd["post_kl.weight"].shape[0] // inner_size
mlp_expand = sd["geo_decoder.cross_attn_decoder.mlp.c_fc.weight"].shape[0] // inner_size
self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) # TODO
self.memory_used_decode = lambda shape, dtype: (1024 * 1024 * 1024 * 2.0) * model_management.dtype_size(dtype) # TODO
ddconfig = {"embed_dim": 64, "num_freqs": 8, "include_pi": False, "heads": 16, "width": 1024, "num_decoder_layers": 16, "qkv_bias": False, "qk_norm": True, "geo_decoder_mlp_expand_ratio": mlp_expand, "geo_decoder_downsample_ratio": downsample_ratio, "geo_decoder_ln_post": ln_post}
self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE(**ddconfig)
self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100)
self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
self.latent_channels = 8
self.output_channels = 2
self.upscale_ratio = 4096
self.downscale_ratio = 4096
self.latent_dim = 2
self.process_output = lambda audio: audio
self.process_input = lambda audio: audio
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
self.disable_offload = True
self.extra_1d_channel = 16
else: else:
logging.warning("WARNING: No VAE weights detected, VAE not initalized.") logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
self.first_stage_model = None self.first_stage_model = None
@@ -482,10 +434,6 @@ class VAE:
self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device) self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype)) logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))
def throw_exception_if_invalid(self):
if self.first_stage_model is None:
raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.")
def vae_encode_crop_pixels(self, pixels): def vae_encode_crop_pixels(self, pixels):
downscale_ratio = self.spacial_compression_encode() downscale_ratio = self.spacial_compression_encode()
@@ -512,13 +460,7 @@ class VAE:
return output return output
def decode_tiled_1d(self, samples, tile_x=128, overlap=32): def decode_tiled_1d(self, samples, tile_x=128, overlap=32):
if samples.ndim == 3: decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
else:
og_shape = samples.shape
samples = samples.reshape((og_shape[0], og_shape[1] * og_shape[2], -1))
decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).float()
return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device)) return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))
def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)): def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
@@ -538,49 +480,33 @@ class VAE:
samples /= 3.0 samples /= 3.0
return samples return samples
def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048): def encode_tiled_1d(self, samples, tile_x=128 * 2048, overlap=32 * 2048):
if self.latent_dim == 1: encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float() return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=(1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device)
out_channels = self.latent_channels
upscale_amount = 1 / self.downscale_ratio
else:
extra_channel_size = self.extra_1d_channel
out_channels = self.latent_channels * extra_channel_size
tile_x = tile_x // extra_channel_size
overlap = overlap // extra_channel_size
upscale_amount = 1 / self.downscale_ratio
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).float()
out = comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=self.output_device)
if self.latent_dim == 1:
return out
else:
return out.reshape(samples.shape[0], self.latent_channels, extra_channel_size, -1)
def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)): def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float() encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device) return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device)
def decode(self, samples_in, vae_options={}): def decode(self, samples_in):
self.throw_exception_if_invalid()
pixel_samples = None pixel_samples = None
try: try:
memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) model_management.load_models_gpu([self.patcher], memory_required=memory_used)
free_memory = model_management.get_free_memory(self.device) free_memory = model_management.get_free_memory(self.device)
batch_number = int(free_memory / memory_used) batch_number = int(free_memory / memory_used)
batch_number = max(1, batch_number) batch_number = max(1, batch_number)
for x in range(0, samples_in.shape[0], batch_number): for x in range(0, samples_in.shape[0], batch_number):
samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device) samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).float()) out = self.process_output(self.first_stage_model.decode(samples).to(self.output_device).float())
if pixel_samples is None: if pixel_samples is None:
pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device) pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
pixel_samples[x:x+batch_number] = out pixel_samples[x:x+batch_number] = out
except model_management.OOM_EXCEPTION: except model_management.OOM_EXCEPTION:
logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
dims = samples_in.ndim - 2 dims = samples_in.ndim - 2
if dims == 1 or self.extra_1d_channel is not None: if dims == 1:
pixel_samples = self.decode_tiled_1d(samples_in) pixel_samples = self.decode_tiled_1d(samples_in)
elif dims == 2: elif dims == 2:
pixel_samples = self.decode_tiled_(samples_in) pixel_samples = self.decode_tiled_(samples_in)
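Both decode and encode above size their batches from the currently free device memory and fall back to tiled processing only on an out-of-memory exception. The batch computation itself reduces to the following; vae_batch_size is an illustrative name:

def vae_batch_size(free_memory, memory_per_item):
    # As many items as fit in free memory, but never fewer than one per pass.
    return max(1, int(free_memory / max(1, memory_per_item)))

print(vae_batch_size(8 * 1024**3, 3 * 1024**3))   # 2
print(vae_batch_size(1 * 1024**3, 3 * 1024**3))   # 1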
@@ -593,9 +519,8 @@ class VAE:
return pixel_samples return pixel_samples
def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None): def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
self.throw_exception_if_invalid()
memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) model_management.load_models_gpu([self.patcher], memory_required=memory_used)
dims = samples.ndim - 2 dims = samples.ndim - 2
args = {} args = {}
if tile_x is not None: if tile_x is not None:
@@ -622,14 +547,13 @@ class VAE:
return output.movedim(1, -1) return output.movedim(1, -1)
def encode(self, pixel_samples): def encode(self, pixel_samples):
self.throw_exception_if_invalid()
pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
pixel_samples = pixel_samples.movedim(-1, 1) pixel_samples = pixel_samples.movedim(-1, 1)
if self.latent_dim == 3 and pixel_samples.ndim < 5: if self.latent_dim == 3 and pixel_samples.ndim < 5:
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
try: try:
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) model_management.load_models_gpu([self.patcher], memory_required=memory_used)
free_memory = model_management.get_free_memory(self.device) free_memory = model_management.get_free_memory(self.device)
batch_number = int(free_memory / max(1, memory_used)) batch_number = int(free_memory / max(1, memory_used))
batch_number = max(1, batch_number) batch_number = max(1, batch_number)
@@ -647,7 +571,7 @@ class VAE:
tile = 256 tile = 256
overlap = tile // 4 overlap = tile // 4
samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
elif self.latent_dim == 1 or self.extra_1d_channel is not None: elif self.latent_dim == 1:
samples = self.encode_tiled_1d(pixel_samples) samples = self.encode_tiled_1d(pixel_samples)
else: else:
samples = self.encode_tiled_(pixel_samples) samples = self.encode_tiled_(pixel_samples)
@@ -655,7 +579,6 @@ class VAE:
return samples return samples
def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None): def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
self.throw_exception_if_invalid()
pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
dims = self.latent_dim dims = self.latent_dim
pixel_samples = pixel_samples.movedim(-1, 1) pixel_samples = pixel_samples.movedim(-1, 1)
@@ -663,7 +586,7 @@ class VAE:
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) model_management.load_models_gpu([self.patcher], memory_required=memory_used)
args = {} args = {}
if tile_x is not None: if tile_x is not None:
@@ -751,9 +674,6 @@ class CLIPType(Enum):
COSMOS = 11 COSMOS = 11
LUMINA2 = 12 LUMINA2 = 12
WAN = 13 WAN = 13
HIDREAM = 14
CHROMA = 15
ACE = 16
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -842,9 +762,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
elif clip_type == CLIPType.SD3: elif clip_type == CLIPType.SD3:
clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False) clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
elif clip_type == CLIPType.HIDREAM:
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
else: else:
clip_target.clip = sdxl_clip.SDXLRefinerClipModel clip_target.clip = sdxl_clip.SDXLRefinerClipModel
clip_target.tokenizer = sdxl_clip.SDXLTokenizer clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@@ -858,17 +775,13 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
elif clip_type == CLIPType.LTXV: elif clip_type == CLIPType.LTXV:
clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data)) clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA: elif clip_type == CLIPType.PIXART:
clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data)) clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
elif clip_type == CLIPType.WAN: elif clip_type == CLIPType.WAN:
clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data)) clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
elif clip_type == CLIPType.HIDREAM:
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
else: #CLIPType.MOCHI else: #CLIPType.MOCHI
clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data)) clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
@@ -879,29 +792,16 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
elif te_model == TEModel.T5_BASE: elif te_model == TEModel.T5_BASE:
if clip_type == CLIPType.ACE or "spiece_model" in clip_data[0]: clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
clip_target.clip = comfy.text_encoders.ace.AceT5Model clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
clip_target.tokenizer = comfy.text_encoders.ace.AceT5Tokenizer
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
else:
clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
elif te_model == TEModel.GEMMA_2_2B: elif te_model == TEModel.GEMMA_2_2B:
clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
elif te_model == TEModel.LLAMA3_8:
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
else: else:
# clip_l
if clip_type == CLIPType.SD3: if clip_type == CLIPType.SD3:
clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False) clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
elif clip_type == CLIPType.HIDREAM:
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
else: else:
clip_target.clip = sd1_clip.SD1ClipModel clip_target.clip = sd1_clip.SD1ClipModel
clip_target.tokenizer = sd1_clip.SD1Tokenizer clip_target.tokenizer = sd1_clip.SD1Tokenizer
@@ -919,33 +819,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
elif clip_type == CLIPType.HUNYUAN_VIDEO: elif clip_type == CLIPType.HUNYUAN_VIDEO:
clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data)) clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer
elif clip_type == CLIPType.HIDREAM:
# Detect
hidream_dualclip_classes = []
for hidream_te in clip_data:
te_model = detect_te_model(hidream_te)
hidream_dualclip_classes.append(te_model)
clip_l = TEModel.CLIP_L in hidream_dualclip_classes
clip_g = TEModel.CLIP_G in hidream_dualclip_classes
t5 = TEModel.T5_XXL in hidream_dualclip_classes
llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
# Initialize t5xxl_detect and llama_detect kwargs if needed
t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
llama_kwargs = llama_detect(clip_data) if llama else {}
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, **t5_kwargs, **llama_kwargs)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
else: else:
clip_target.clip = sdxl_clip.SDXLClipModel clip_target.clip = sdxl_clip.SDXLClipModel
clip_target.tokenizer = sdxl_clip.SDXLTokenizer clip_target.tokenizer = sdxl_clip.SDXLTokenizer
elif len(clip_data) == 3: elif len(clip_data) == 3:
clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(**t5xxl_detect(clip_data)) clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
elif len(clip_data) == 4:
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), **llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
parameters = 0 parameters = 0
for c in clip_data: for c in clip_data:
@@ -994,13 +873,13 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
return (model, clip, vae) return (model, clip, vae)
def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}): def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True) sd = comfy.utils.load_torch_file(ckpt_path)
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata) out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
if out is None: if out is None:
raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path)) raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
return out return out
def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None): def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
clip = None clip = None
clipvision = None clipvision = None
vae = None vae = None
@@ -1012,14 +891,9 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
load_device = model_management.get_torch_device() load_device = model_management.get_torch_device()
model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata) model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
if model_config is None: if model_config is None:
logging.warning("Warning, This is not a checkpoint file, trying to load it as a diffusion model only.") return None
diffusion_model = load_diffusion_model_state_dict(sd, model_options={})
if diffusion_model is None:
return None
return (diffusion_model, None, VAE(sd={}), None) # The VAE object is there to throw an exception if it's actually used'
unet_weight_dtype = list(model_config.supported_inference_dtypes) unet_weight_dtype = list(model_config.supported_inference_dtypes)
if model_config.scaled_fp8 is not None: if model_config.scaled_fp8 is not None:
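The new fallback above returns VAE(sd={}) purely as a placeholder: constructing it is harmless, and throw_exception_if_invalid only raises if a caller actually tries to use it. The general pattern, sketched with a hypothetical stand-in class rather than the real VAE:

class InvalidVAE:
    # Cheap to construct; any real use raises a descriptive error, so callers
    # that never touch the VAE output keep working.
    def _fail(self, *args, **kwargs):
        raise RuntimeError("VAE is invalid: the loaded file did not contain VAE weights.")
    encode = _fail
    decode = _fail

vae = InvalidVAE()          # fine
# vae.decode(latents)       # would raise RuntimeError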
@@ -1046,7 +920,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
if output_vae: if output_vae:
vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True) vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
vae_sd = model_config.process_vae_state_dict(vae_sd) vae_sd = model_config.process_vae_state_dict(vae_sd)
vae = VAE(sd=vae_sd, metadata=metadata) vae = VAE(sd=vae_sd)
if output_clip: if output_clip:
clip_target = model_config.clip_target(state_dict=sd) clip_target = model_config.clip_target(state_dict=sd)

View File

@@ -82,8 +82,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
LAYERS = [ LAYERS = [
"last", "last",
"pooled", "pooled",
"hidden", "hidden"
"all"
] ]
def __init__(self, device="cpu", max_length=77, def __init__(self, device="cpu", max_length=77,
freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
@@ -94,8 +93,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
if textmodel_json_config is None: if textmodel_json_config is None:
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")
if "model_name" not in model_options:
model_options = {**model_options, "model_name": "clip_l"}
if isinstance(textmodel_json_config, dict): if isinstance(textmodel_json_config, dict):
config = textmodel_json_config config = textmodel_json_config
@@ -103,10 +100,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
with open(textmodel_json_config) as f: with open(textmodel_json_config) as f:
config = json.load(f) config = json.load(f)
te_model_options = model_options.get("{}_model_config".format(model_options.get("model_name", "")), {})
for k, v in te_model_options.items():
config[k] = v
operations = model_options.get("custom_operations", None) operations = model_options.get("custom_operations", None)
scaled_fp8 = None scaled_fp8 = None
@@ -154,9 +147,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
def set_clip_options(self, options): def set_clip_options(self, options):
layer_idx = options.get("layer", self.layer_idx) layer_idx = options.get("layer", self.layer_idx)
self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
if self.layer == "all": if layer_idx is None or abs(layer_idx) > self.num_layers:
pass
elif layer_idx is None or abs(layer_idx) > self.num_layers:
self.layer = "last" self.layer = "last"
else: else:
self.layer = "hidden" self.layer = "hidden"
@@ -167,98 +158,71 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
self.layer_idx = self.options_default[1] self.layer_idx = self.options_default[1]
self.return_projected_pooled = self.options_default[2] self.return_projected_pooled = self.options_default[2]
def process_tokens(self, tokens, device): def set_up_textual_embeddings(self, tokens, current_embeds):
end_token = self.special_tokens.get("end", None) out_tokens = []
if end_token is None: next_new_token = token_dict_size = current_embeds.weight.shape[0]
cmp_token = self.special_tokens.get("pad", -1) embedding_weights = []
else:
cmp_token = end_token
embeds_out = []
attention_masks = []
num_tokens = []
for x in tokens: for x in tokens:
attention_mask = []
tokens_temp = [] tokens_temp = []
other_embeds = []
eos = False
index = 0
for y in x: for y in x:
if isinstance(y, numbers.Integral): if isinstance(y, numbers.Integral):
if eos: tokens_temp += [int(y)]
attention_mask.append(0) else:
if y.shape[0] == current_embeds.weight.shape[1]:
embedding_weights += [y]
tokens_temp += [next_new_token]
next_new_token += 1
else: else:
attention_mask.append(1) logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(y.shape[0], current_embeds.weight.shape[1]))
token = int(y) while len(tokens_temp) < len(x):
tokens_temp += [token] tokens_temp += [self.special_tokens["pad"]]
if not eos and token == cmp_token: out_tokens += [tokens_temp]
if end_token is None:
attention_mask[-1] = 0
eos = True
else:
other_embeds.append((index, y))
index += 1
tokens_embed = torch.tensor([tokens_temp], device=device, dtype=torch.long) n = token_dict_size
tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32) if len(embedding_weights) > 0:
index = 0 new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
pad_extra = 0 new_embedding.weight[:token_dict_size] = current_embeds.weight
for o in other_embeds: for x in embedding_weights:
emb = o[1] new_embedding.weight[n] = x
if torch.is_tensor(emb): n += 1
emb = {"type": "embedding", "data": emb} self.transformer.set_input_embeddings(new_embedding)
emb_type = emb.get("type", None) processed_tokens = []
if emb_type == "embedding": for x in out_tokens:
emb = emb.get("data", None) processed_tokens += [list(map(lambda a: n if a == -1 else a, x))] #The EOS token should always be the largest one
else:
if hasattr(self.transformer, "preprocess_embed"):
emb = self.transformer.preprocess_embed(emb, device=device)
else:
emb = None
if emb is None: return processed_tokens
index += -1
continue
ind = index + o[0]
emb = emb.view(1, -1, emb.shape[-1]).to(device=device, dtype=torch.float32)
emb_shape = emb.shape[1]
if emb.shape[-1] == tokens_embed.shape[-1]:
tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1)
attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:]
index += emb_shape - 1
else:
index += -1
pad_extra += emb_shape
logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(emb.shape[-1], tokens_embed.shape[-1]))
if pad_extra > 0:
padd_embed = self.transformer.get_input_embeddings()(torch.tensor([[self.special_tokens["pad"]] * pad_extra], device=device, dtype=torch.long), out_dtype=torch.float32)
tokens_embed = torch.cat([tokens_embed, padd_embed], dim=1)
attention_mask = attention_mask + [0] * pad_extra
embeds_out.append(tokens_embed)
attention_masks.append(attention_mask)
num_tokens.append(sum(attention_mask))
return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens
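process_tokens above marks every position up to and including the end token as attended and zeroes everything after it (when no end token is defined, the pad token itself is masked out instead). The per-sequence rule in isolation, with illustrative token ids:

def build_attention_mask(token_ids, end_token):
    # 1 up to and including the first end token, 0 for everything after it.
    mask, seen_end = [], False
    for token in token_ids:
        mask.append(0 if seen_end else 1)
        if not seen_end and token == end_token:
            seen_end = True
    return mask

print(build_attention_mask([49406, 320, 49407, 0, 0], 49407))   # [1, 1, 1, 0, 0]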
def forward(self, tokens): def forward(self, tokens):
device = self.transformer.get_input_embeddings().weight.device backup_embeds = self.transformer.get_input_embeddings()
embeds, attention_mask, num_tokens = self.process_tokens(tokens, device) device = backup_embeds.weight.device
tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
tokens = torch.LongTensor(tokens).to(device)
attention_mask = None
if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks:
attention_mask = torch.zeros_like(tokens)
end_token = self.special_tokens.get("end", None)
if end_token is None:
cmp_token = self.special_tokens.get("pad", -1)
else:
cmp_token = end_token
for x in range(attention_mask.shape[0]):
for y in range(attention_mask.shape[1]):
attention_mask[x, y] = 1
if tokens[x, y] == cmp_token:
if end_token is None:
attention_mask[x, y] = 0
break
attention_mask_model = None attention_mask_model = None
if self.enable_attention_masks: if self.enable_attention_masks:
attention_mask_model = attention_mask attention_mask_model = attention_mask
if self.layer == "all": outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
intermediate_output = "all" self.transformer.set_input_embeddings(backup_embeds)
else:
intermediate_output = self.layer_idx
outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
if self.layer == "last": if self.layer == "last":
z = outputs[0].float() z = outputs[0].float()
@@ -457,14 +421,13 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
return embed_out return embed_out
class SDTokenizer: class SDTokenizer:
def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}): def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, tokenizer_data={}, tokenizer_args={}):
if tokenizer_path is None: if tokenizer_path is None:
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args) self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
self.max_length = tokenizer_data.get("{}_max_length".format(embedding_key), max_length) self.max_length = max_length
self.min_length = min_length self.min_length = min_length
self.end_token = None self.end_token = None
self.min_padding = min_padding
empty = self.tokenizer('')["input_ids"] empty = self.tokenizer('')["input_ids"]
self.tokenizer_adds_end_token = has_end_token self.tokenizer_adds_end_token = has_end_token
@@ -519,15 +482,13 @@ class SDTokenizer:
return (embed, leftover) return (embed, leftover)
def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
''' '''
Takes a prompt and converts it to a list of (token, weight, word id) elements. Takes a prompt and converts it to a list of (token, weight, word id) elements.
Tokens can both be integer tokens and pre computed CLIP tensors. Tokens can both be integer tokens and pre computed CLIP tensors.
Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens.
Returned list has the dimensions NxM where M is the input size of CLIP Returned list has the dimensions NxM where M is the input size of CLIP
''' '''
min_length = tokenizer_options.get("{}_min_length".format(self.embedding_key), self.min_length)
min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding)
text = escape_important(text) text = escape_important(text)
parsed_weights = token_weights(text, 1.0) parsed_weights = token_weights(text, 1.0)
@@ -606,12 +567,10 @@ class SDTokenizer:
#fill last batch #fill last batch
if self.end_token is not None: if self.end_token is not None:
batch.append((self.end_token, 1.0, 0)) batch.append((self.end_token, 1.0, 0))
if min_padding is not None: if self.pad_to_max_length:
batch.extend([(self.pad_token, 1.0, 0)] * min_padding)
if self.pad_to_max_length and len(batch) < self.max_length:
batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch))) batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
if min_length is not None and len(batch) < min_length: if self.min_length is not None and len(batch) < self.min_length:
batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch))) batch.extend([(self.pad_token, 1.0, 0)] * (self.min_length - len(batch)))
if not return_word_ids: if not return_word_ids:
batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
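The padding changes above apply, in order: a fixed min_padding amount, padding up to max_length when pad_to_max_length is set, and finally padding up to min_length if the batch is still short. Reduced to a helper; the name and the (token, weight, word_id) entry format follow the surrounding code:

def pad_token_batch(batch, pad_token, max_length=None, min_length=None, min_padding=None):
    entry = (pad_token, 1.0, 0)
    if min_padding is not None:
        batch.extend([entry] * min_padding)
    if max_length is not None and len(batch) < max_length:
        batch.extend([entry] * (max_length - len(batch)))
    if min_length is not None and len(batch) < min_length:
        batch.extend([entry] * (min_length - len(batch)))
    return batch

print(len(pad_token_batch([(49406, 1.0, 0), (49407, 1.0, 0)], 0, max_length=77)))   # 77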
@@ -637,9 +596,9 @@ class SD1Tokenizer:
tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer) tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)) setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {} out = {}
out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids, **kwargs) out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids)
return out return out
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):
@@ -664,7 +623,6 @@ class SD1ClipModel(torch.nn.Module):
self.clip = "clip_{}".format(self.clip_name) self.clip = "clip_{}".format(self.clip_name)
clip_model = model_options.get("{}_class".format(self.clip), clip_model) clip_model = model_options.get("{}_class".format(self.clip), clip_model)
model_options = {**model_options, "model_name": self.clip}
setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs)) setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs))
self.dtypes = set() self.dtypes = set()

View File

@@ -9,7 +9,6 @@ class SDXLClipG(sd1_clip.SDClipModel):
layer_idx=-2 layer_idx=-2
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
model_options = {**model_options, "model_name": "clip_g"}
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options) special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options)
@@ -18,18 +17,19 @@ class SDXLClipG(sd1_clip.SDClipModel):
class SDXLClipGTokenizer(sd1_clip.SDTokenizer): class SDXLClipGTokenizer(sd1_clip.SDTokenizer):
def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}): def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g', tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')
class SDXLTokenizer: class SDXLTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {} out = {}
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids, **kwargs) out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs) out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
return out return out
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):
@@ -41,7 +41,8 @@ class SDXLTokenizer:
class SDXLClipModel(torch.nn.Module): class SDXLClipModel(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}): def __init__(self, device="cpu", dtype=None, model_options={}):
super().__init__() super().__init__()
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options) clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options)
self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options) self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.dtypes = set([dtype]) self.dtypes = set([dtype])
@@ -74,7 +75,7 @@ class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):
class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer): class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}): def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g', tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')
class StableCascadeTokenizer(sd1_clip.SD1Tokenizer): class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
@@ -83,7 +84,6 @@ class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
class StableCascadeClipG(sd1_clip.SDClipModel): class StableCascadeClipG(sd1_clip.SDClipModel):
def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, model_options={}): def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
model_options = {**model_options, "model_name": "clip_g"}
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options) special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options)

View File

@@ -17,7 +17,6 @@ import comfy.text_encoders.hunyuan_video
import comfy.text_encoders.cosmos import comfy.text_encoders.cosmos
import comfy.text_encoders.lumina2 import comfy.text_encoders.lumina2
import comfy.text_encoders.wan import comfy.text_encoders.wan
import comfy.text_encoders.ace
from . import supported_models_base from . import supported_models_base
from . import latent_formats from . import latent_formats
@@ -507,22 +506,6 @@ class SDXL_instructpix2pix(SDXL):
def get_model(self, state_dict, prefix="", device=None): def get_model(self, state_dict, prefix="", device=None):
return model_base.SDXL_instructpix2pix(self, model_type=self.model_type(state_dict, prefix), device=device) return model_base.SDXL_instructpix2pix(self, model_type=self.model_type(state_dict, prefix), device=device)
class LotusD(SD20):
unet_config = {
"model_channels": 320,
"use_linear_in_transformer": True,
"use_temporal_attention": False,
"adm_in_channels": 4,
"in_channels": 4,
}
unet_extra_config = {
"num_classes": 'sequential'
}
def get_model(self, state_dict, prefix="", device=None):
return model_base.Lotus(self, device=device)
class SD3(supported_models_base.BASE): class SD3(supported_models_base.BASE):
unet_config = { unet_config = {
"in_channels": 16, "in_channels": 16,
@@ -779,17 +762,13 @@ class LTXV(supported_models_base.BASE):
unet_extra_config = {} unet_extra_config = {}
latent_format = latent_formats.LTXV latent_format = latent_formats.LTXV
memory_usage_factor = 5.5 # TODO: img2vid is about 2x vs txt2vid memory_usage_factor = 2.7
supported_inference_dtypes = [torch.bfloat16, torch.float32] supported_inference_dtypes = [torch.bfloat16, torch.float32]
vae_key_prefix = ["vae."] vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."] text_encoder_key_prefix = ["text_encoders."]
def __init__(self, unet_config):
super().__init__(unet_config)
self.memory_usage_factor = (unet_config.get("cross_attention_dim", 2048) / 2048) * 5.5
def get_model(self, state_dict, prefix="", device=None): def get_model(self, state_dict, prefix="", device=None):
out = model_base.LTXV(self, device=device) out = model_base.LTXV(self, device=device)
return out return out
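The __init__ shown on the left scales the memory estimate with the checkpoint's cross-attention width rather than using a fixed constant. A quick worked example under that formula (the 4096 value is hypothetical; 2048 reproduces the base factor of 5.5):

cross_attention_dim = 4096
memory_usage_factor = (cross_attention_dim / 2048) * 5.5
print(memory_usage_factor)  # 11.0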
@@ -847,26 +826,6 @@ class HunyuanVideo(supported_models_base.BASE):
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}llama.transformer.".format(pref)) hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}llama.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**hunyuan_detect)) return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**hunyuan_detect))
class HunyuanVideoI2V(HunyuanVideo):
unet_config = {
"image_model": "hunyuan_video",
"in_channels": 33,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanVideoI2V(self, device=device)
return out
class HunyuanVideoSkyreelsI2V(HunyuanVideo):
unet_config = {
"image_model": "hunyuan_video",
"in_channels": 32,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanVideoSkyreelsI2V(self, device=device)
return out
class CosmosT2V(supported_models_base.BASE): class CosmosT2V(supported_models_base.BASE):
unet_config = { unet_config = {
"image_model": "cosmos", "image_model": "cosmos",
@@ -952,7 +911,7 @@ class WAN21_T2V(supported_models_base.BASE):
memory_usage_factor = 1.0 memory_usage_factor = 1.0
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32] supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
vae_key_prefix = ["vae."] vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."] text_encoder_key_prefix = ["text_encoders."]
@@ -974,171 +933,12 @@ class WAN21_I2V(WAN21_T2V):
unet_config = { unet_config = {
"image_model": "wan2.1", "image_model": "wan2.1",
"model_type": "i2v", "model_type": "i2v",
"in_dim": 36,
} }
def get_model(self, state_dict, prefix="", device=None): def get_model(self, state_dict, prefix="", device=None):
out = model_base.WAN21(self, image_to_video=True, device=device) out = model_base.WAN21(self, image_to_video=True, device=device)
return out return out
class WAN21_FunControl2V(WAN21_T2V): models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V]
unet_config = {
"image_model": "wan2.1",
"model_type": "i2v",
"in_dim": 48,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.WAN21(self, image_to_video=False, device=device)
return out
class WAN21_Camera(WAN21_T2V):
unet_config = {
"image_model": "wan2.1",
"model_type": "camera",
"in_dim": 32,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.WAN21_Camera(self, image_to_video=False, device=device)
return out
class WAN21_Vace(WAN21_T2V):
unet_config = {
"image_model": "wan2.1",
"model_type": "vace",
}
def __init__(self, unet_config):
super().__init__(unet_config)
self.memory_usage_factor = 1.2 * self.memory_usage_factor
def get_model(self, state_dict, prefix="", device=None):
out = model_base.WAN21_Vace(self, image_to_video=False, device=device)
return out
class Hunyuan3Dv2(supported_models_base.BASE):
unet_config = {
"image_model": "hunyuan3d2",
}
unet_extra_config = {}
sampling_settings = {
"multiplier": 1.0,
"shift": 1.0,
}
memory_usage_factor = 3.5
clip_vision_prefix = "conditioner.main_image_encoder.model."
vae_key_prefix = ["vae."]
latent_format = latent_formats.Hunyuan3Dv2
def process_unet_state_dict_for_saving(self, state_dict):
replace_prefix = {"": "model."}
return utils.state_dict_prefix_replace(state_dict, replace_prefix)
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Hunyuan3Dv2(self, device=device)
return out
def clip_target(self, state_dict={}):
return None
class Hunyuan3Dv2mini(Hunyuan3Dv2):
unet_config = {
"image_model": "hunyuan3d2",
"depth": 8,
}
latent_format = latent_formats.Hunyuan3Dv2mini
class HiDream(supported_models_base.BASE):
unet_config = {
"image_model": "hidream",
}
sampling_settings = {
"shift": 3.0,
}
sampling_settings = {
}
# memory_usage_factor = 1.2 # TODO
unet_extra_config = {}
latent_format = latent_formats.Flux
supported_inference_dtypes = [torch.bfloat16, torch.float32]
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HiDream(self, device=device)
return out
def clip_target(self, state_dict={}):
return None # TODO
class Chroma(supported_models_base.BASE):
unet_config = {
"image_model": "chroma",
}
unet_extra_config = {
}
sampling_settings = {
"multiplier": 1.0,
}
latent_format = comfy.latent_formats.Flux
memory_usage_factor = 3.2
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Chroma(self, device=device)
return out
def clip_target(self, state_dict={}):
pref = self.text_encoder_key_prefix[0]
t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect))
class ACEStep(supported_models_base.BASE):
unet_config = {
"audio_model": "ace",
}
unet_extra_config = {
}
sampling_settings = {
"shift": 3.0,
}
latent_format = comfy.latent_formats.ACEAudio
memory_usage_factor = 0.5
supported_inference_dtypes = [torch.bfloat16, torch.float32]
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.ACEStep(self, device=device)
return out
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.ace.AceT5Tokenizer, comfy.text_encoders.ace.AceT5Model)
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep]
models += [SVD_img2vid] models += [SVD_img2vid]
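For orientation, each entry in the models list above is a config class whose unet_config dict acts as a matching pattern against the UNet config detected from a checkpoint. A simplified, illustrative sketch of that selection step (not the exact ComfyUI implementation):

def match_model_config(detected_unet_config, models):
    # Return an instance of the first config class whose unet_config keys all
    # agree with the detected config; None means "unsupported model".
    for candidate in models:
        if all(detected_unet_config.get(k) == v for k, v in candidate.unet_config.items()):
            return candidate(detected_unet_config)
    return None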

View File

@@ -1,153 +0,0 @@
from comfy import sd1_clip
from .spiece_tokenizer import SPieceTokenizer
import comfy.text_encoders.t5
import os
import re
import torch
import logging
from tokenizers import Tokenizer
from .ace_text_cleaners import multilingual_cleaners, japanese_to_romaji
SUPPORT_LANGUAGES = {
"en": 259, "de": 260, "fr": 262, "es": 284, "it": 285,
"pt": 286, "pl": 294, "tr": 295, "ru": 267, "cs": 293,
"nl": 297, "ar": 5022, "zh": 5023, "ja": 5412, "hu": 5753,
"ko": 6152, "hi": 6680
}
structure_pattern = re.compile(r"\[.*?\]")
DEFAULT_VOCAB_FILE = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")
class VoiceBpeTokenizer:
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
self.tokenizer = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
def preprocess_text(self, txt, lang):
txt = multilingual_cleaners(txt, lang)
return txt
def encode(self, txt, lang='en'):
# lang = lang.split("-")[0] # remove the region
# self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
lang = "zh-cn" if lang == "zh" else lang
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
return self.tokenizer.encode(txt).ids
def get_lang(self, line):
if line.startswith("[") and line[3:4] == ']':
lang = line[1:3].lower()
if lang in SUPPORT_LANGUAGES:
return lang, line[4:]
return "en", line
def __call__(self, string):
lines = string.split("\n")
lyric_token_idx = [261]
for line in lines:
line = line.strip()
if not line:
lyric_token_idx += [2]
continue
lang, line = self.get_lang(line)
if lang not in SUPPORT_LANGUAGES:
lang = "en"
if "zh" in lang:
lang = "zh"
if "spa" in lang:
lang = "es"
try:
line_out = japanese_to_romaji(line)
if line_out != line:
lang = "ja"
line = line_out
except:
pass
try:
if structure_pattern.match(line):
token_idx = self.encode(line, "en")
else:
token_idx = self.encode(line, lang)
lyric_token_idx = lyric_token_idx + token_idx + [2]
except Exception as e:
logging.warning("tokenize error {} for line {} major_language {}".format(e, line, lang))
return {"input_ids": lyric_token_idx}
@staticmethod
def from_pretrained(path, **kwargs):
return VoiceBpeTokenizer(path, **kwargs)
def get_vocab(self):
return {}
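A small usage sketch for the tokenizer above: the language-tag helper works without any vocab file, while full tokenization assumes the bundled ace_lyrics_tokenizer/vocab.json is available next to the module.

tok = VoiceBpeTokenizer(vocab_file=None)       # enough to inspect language tags
print(tok.get_lang("[ja]こんにちは"))            # ('ja', 'こんにちは')
print(tok.get_lang("no tag, defaults to en"))  # ('en', 'no tag, defaults to en')

full = VoiceBpeTokenizer()                     # loads the bundled vocab.json
ids = full("[en]la la la\n[de]Hallo Welt")["input_ids"]  # starts with 261, each line ends with token 2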
class UMT5BaseModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "umt5_config_base.json")
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=False, model_options=model_options)
class UMT5BaseTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer = tokenizer_data.get("spiece_model", None)
super().__init__(tokenizer, pad_with_end=False, embedding_size=768, embedding_key='umt5base', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=0, tokenizer_data=tokenizer_data)
def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()}
class LyricsTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")
super().__init__(tokenizer, pad_with_end=False, embedding_size=1024, embedding_key='lyrics', tokenizer_class=VoiceBpeTokenizer, has_start_token=True, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=2, has_end_token=False, tokenizer_data=tokenizer_data)
class AceT5Tokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
self.voicebpe = LyricsTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
self.umt5base = UMT5BaseTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
out = {}
out["lyrics"] = self.voicebpe.tokenize_with_weights(kwargs.get("lyrics", ""), return_word_ids, **kwargs)
out["umt5base"] = self.umt5base.tokenize_with_weights(text, return_word_ids, **kwargs)
return out
def untokenize(self, token_weight_pair):
return self.umt5base.untokenize(token_weight_pair)
def state_dict(self):
return self.umt5base.state_dict()
class AceT5Model(torch.nn.Module):
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
super().__init__()
self.umt5base = UMT5BaseModel(device=device, dtype=dtype, model_options=model_options)
self.dtypes = set()
if dtype is not None:
self.dtypes.add(dtype)
def set_clip_options(self, options):
self.umt5base.set_clip_options(options)
def reset_clip_options(self):
self.umt5base.reset_clip_options()
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_umt5base = token_weight_pairs["umt5base"]
token_weight_pairs_lyrics = token_weight_pairs["lyrics"]
t5_out, t5_pooled = self.umt5base.encode_token_weights(token_weight_pairs_umt5base)
lyrics_embeds = torch.tensor(list(map(lambda a: a[0], token_weight_pairs_lyrics[0]))).unsqueeze(0)
return t5_out, None, {"conditioning_lyrics": lyrics_embeds}
def load_sd(self, sd):
return self.umt5base.load_sd(sd)
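For clarity, the "conditioning_lyrics" entry returned by encode_token_weights is simply the raw lyric token ids packed into a batch-of-one integer tensor, mirroring the torch.tensor(...).unsqueeze(0) line above. A tiny standalone illustration with made-up ids:

import torch
token_weight_pairs_lyrics = [[(261, 1.0), (305, 1.0), (2, 1.0)]]  # hypothetical (id, weight) pairs
lyrics_embeds = torch.tensor([pair[0] for pair in token_weight_pairs_lyrics[0]]).unsqueeze(0)
print(lyrics_embeds)        # tensor([[261, 305,   2]])
print(lyrics_embeds.shape)  # torch.Size([1, 3])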

File diff suppressed because it is too large

View File

@@ -1,395 +0,0 @@
# basic text cleaners for the ACE step model
# I didn't copy the ones from the reference code because I didn't want to deal with the dependencies
# TODO: more languages than english?
import re
def japanese_to_romaji(japanese_text):
"""
Convert Japanese hiragana and katakana to romaji (Latin alphabet representation).
Args:
japanese_text (str): Text containing hiragana and/or katakana characters
Returns:
str: The romaji (Latin alphabet) equivalent
"""
# Dictionary mapping kana characters to their romaji equivalents
kana_map = {
# Katakana characters
'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o',
'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko',
'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so',
'タ': 'ta', 'チ': 'chi', 'ツ': 'tsu', 'テ': 'te', 'ト': 'to',
'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho',
'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo',
'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo',
'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro',
'ワ': 'wa', 'ヲ': 'wo', 'ン': 'n',
# Katakana voiced consonants
'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu', 'ゲ': 'ge', 'ゴ': 'go',
'ザ': 'za', 'ジ': 'ji', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo',
'ダ': 'da', 'ヂ': 'ji', 'ヅ': 'zu', 'デ': 'de', 'ド': 'do',
'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be', 'ボ': 'bo',
'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po',
# Katakana combinations
'キャ': 'kya', 'キュ': 'kyu', 'キョ': 'kyo',
'シャ': 'sha', 'シュ': 'shu', 'ショ': 'sho',
'チャ': 'cha', 'チュ': 'chu', 'チョ': 'cho',
'ニャ': 'nya', 'ニュ': 'nyu', 'ニョ': 'nyo',
'ヒャ': 'hya', 'ヒュ': 'hyu', 'ヒョ': 'hyo',
'ミャ': 'mya', 'ミュ': 'myu', 'ミョ': 'myo',
'リャ': 'rya', 'リュ': 'ryu', 'リョ': 'ryo',
'ギャ': 'gya', 'ギュ': 'gyu', 'ギョ': 'gyo',
'ジャ': 'ja', 'ジュ': 'ju', 'ジョ': 'jo',
'ビャ': 'bya', 'ビュ': 'byu', 'ビョ': 'byo',
'ピャ': 'pya', 'ピュ': 'pyu', 'ピョ': 'pyo',
# Katakana small characters and special cases
'ッ': '', # Small tsu (doubles the following consonant)
'ャ': 'ya', 'ュ': 'yu', 'ョ': 'yo',
# Katakana extras
'ヴ': 'vu', 'ファ': 'fa', 'フィ': 'fi', 'フェ': 'fe', 'フォ': 'fo',
'ウィ': 'wi', 'ウェ': 'we', 'ウォ': 'wo',
# Hiragana characters
'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o',
'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko',
'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so',
'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to',
'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no',
'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho',
'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo',
'や': 'ya', 'ゆ': 'yu', 'よ': 'yo',
'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro',
'わ': 'wa', 'を': 'wo', 'ん': 'n',
# Hiragana voiced consonants
'が': 'ga', 'ぎ': 'gi', 'ぐ': 'gu', 'げ': 'ge', 'ご': 'go',
'ざ': 'za', 'じ': 'ji', 'ず': 'zu', 'ぜ': 'ze', 'ぞ': 'zo',
'だ': 'da', 'ぢ': 'ji', 'づ': 'zu', 'で': 'de', 'ど': 'do',
'ば': 'ba', 'び': 'bi', 'ぶ': 'bu', 'べ': 'be', 'ぼ': 'bo',
'ぱ': 'pa', 'ぴ': 'pi', 'ぷ': 'pu', 'ぺ': 'pe', 'ぽ': 'po',
# Hiragana combinations
'きゃ': 'kya', 'きゅ': 'kyu', 'きょ': 'kyo',
'しゃ': 'sha', 'しゅ': 'shu', 'しょ': 'sho',
'ちゃ': 'cha', 'ちゅ': 'chu', 'ちょ': 'cho',
'にゃ': 'nya', 'にゅ': 'nyu', 'にょ': 'nyo',
'ひゃ': 'hya', 'ひゅ': 'hyu', 'ひょ': 'hyo',
'みゃ': 'mya', 'みゅ': 'myu', 'みょ': 'myo',
'りゃ': 'rya', 'りゅ': 'ryu', 'りょ': 'ryo',
'ぎゃ': 'gya', 'ぎゅ': 'gyu', 'ぎょ': 'gyo',
'じゃ': 'ja', 'じゅ': 'ju', 'じょ': 'jo',
'びゃ': 'bya', 'びゅ': 'byu', 'びょ': 'byo',
'ぴゃ': 'pya', 'ぴゅ': 'pyu', 'ぴょ': 'pyo',
# Hiragana small characters and special cases
'っ': '', # Small tsu (doubles the following consonant)
'ゃ': 'ya', 'ゅ': 'yu', 'ょ': 'yo',
# Common punctuation and spaces
'　': ' ', # Japanese space
'、': ', ', '。': '. ',
}
result = []
i = 0
while i < len(japanese_text):
# Check for small tsu (doubling the following consonant)
if i < len(japanese_text) - 1 and (japanese_text[i] == 'っ' or japanese_text[i] == 'ッ'):
if i < len(japanese_text) - 1 and japanese_text[i+1] in kana_map:
next_romaji = kana_map[japanese_text[i+1]]
if next_romaji and next_romaji[0] not in 'aiueon':
result.append(next_romaji[0]) # Double the consonant
i += 1
continue
# Check for combinations with small ya, yu, yo
if i < len(japanese_text) - 1 and japanese_text[i+1] in ('ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ'):
combo = japanese_text[i:i+2]
if combo in kana_map:
result.append(kana_map[combo])
i += 2
continue
# Regular character
if japanese_text[i] in kana_map:
result.append(kana_map[japanese_text[i]])
else:
# If it's not in our map, keep it as is (might be kanji, romaji, etc.)
result.append(japanese_text[i])
i += 1
return ''.join(result)
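A few quick checks of the conversion above. The outputs follow the mapping table literally, so for example the topic particle は is always rendered "ha", never "wa":

print(japanese_to_romaji("カタカナ"))   # katakana
print(japanese_to_romaji("きょう"))     # kyou
print(japanese_to_romaji("ちょっと"))   # chotto  (small っ doubles the following consonant)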
def number_to_text(num, ordinal=False):
"""
Convert a number (int or float) to its text representation.
Args:
num: The number to convert
Returns:
str: Text representation of the number
"""
if not isinstance(num, (int, float)):
return "Input must be a number"
# Handle special case of zero
if num == 0:
return "zero"
# Handle negative numbers
negative = num < 0
num = abs(num)
# Handle floats
if isinstance(num, float):
# Split into integer and decimal parts
int_part = int(num)
# Convert both parts
int_text = _int_to_text(int_part)
# Handle decimal part (convert to string and remove '0.')
decimal_str = str(num).split('.')[1]
decimal_text = " point " + " ".join(_digit_to_text(int(digit)) for digit in decimal_str)
result = int_text + decimal_text
else:
# Handle integers
result = _int_to_text(num)
# Add 'negative' prefix for negative numbers
if negative:
result = "negative " + result
return result
def _int_to_text(num):
"""Helper function to convert an integer to text"""
ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
"seventeen", "eighteen", "nineteen"]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
if num < 20:
return ones[num]
if num < 100:
return tens[num // 10] + (" " + ones[num % 10] if num % 10 != 0 else "")
if num < 1000:
return ones[num // 100] + " hundred" + (" " + _int_to_text(num % 100) if num % 100 != 0 else "")
if num < 1000000:
return _int_to_text(num // 1000) + " thousand" + (" " + _int_to_text(num % 1000) if num % 1000 != 0 else "")
if num < 1000000000:
return _int_to_text(num // 1000000) + " million" + (" " + _int_to_text(num % 1000000) if num % 1000000 != 0 else "")
return _int_to_text(num // 1000000000) + " billion" + (" " + _int_to_text(num % 1000000000) if num % 1000000000 != 0 else "")
def _digit_to_text(digit):
"""Convert a single digit to text"""
digits = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
return digits[digit]
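A few worked examples for the number helpers above (note that the ordinal flag accepted by number_to_text is currently ignored):

print(number_to_text(0))      # zero
print(number_to_text(42))     # forty two
print(number_to_text(-7))     # negative seven
print(number_to_text(3.5))    # three point five
print(number_to_text(1200))   # one thousand two hundred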
_whitespace_re = re.compile(r"\s+")
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = {
"en": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
],
}
def expand_abbreviations_multilingual(text, lang="en"):
for regex, replacement in _abbreviations[lang]:
text = re.sub(regex, replacement, text)
return text
_symbols_multilingual = {
"en": [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " and "),
("@", " at "),
("%", " percent "),
("#", " hash "),
("$", " dollar "),
("£", " pound "),
("°", " degree "),
]
],
}
def expand_symbols_multilingual(text, lang="en"):
for regex, replacement in _symbols_multilingual[lang]:
text = re.sub(regex, replacement, text)
text = text.replace(" ", " ") # Ensure there are no double spaces
return text.strip()
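Example of the abbreviation and symbol expansion above (English is the only language in these tables):

print(expand_abbreviations_multilingual("Dr. Smith met Mr. Lee.", "en"))
# -> "doctor Smith met mister Lee."
print(expand_symbols_multilingual("50% off & free shipping", "en"))
# -> "50 percent off and free shipping"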
_ordinal_re = {
"en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
"USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
"GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
"EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}
_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
_dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b")
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
def _remove_commas(m):
text = m.group(0)
if "," in text:
text = text.replace(",", "")
return text
def _remove_dots(m):
text = m.group(0)
if "." in text:
text = text.replace(".", "")
return text
def _expand_decimal_point(m, lang="en"):
amount = m.group(1).replace(",", ".")
return number_to_text(float(amount))
def _expand_currency(m, lang="en", currency="USD"):
amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
full_amount = number_to_text(amount)
and_equivalents = {
"en": ", ",
"es": " con ",
"fr": " et ",
"de": " und ",
"pt": " e ",
"it": " e ",
"pl": ", ",
"cs": ", ",
"ru": ", ",
"nl": ", ",
"ar": ", ",
"tr": ", ",
"hu": ", ",
"ko": ", ",
}
if amount.is_integer():
last_and = full_amount.rfind(and_equivalents[lang])
if last_and != -1:
full_amount = full_amount[:last_and]
return full_amount
def _expand_ordinal(m, lang="en"):
return number_to_text(int(m.group(1)), ordinal=True)
def _expand_number(m, lang="en"):
return number_to_text(int(m.group(0)))
def expand_numbers_multilingual(text, lang="en"):
if lang in ["en", "ru"]:
text = re.sub(_comma_number_re, _remove_commas, text)
else:
text = re.sub(_dot_number_re, _remove_dots, text)
try:
text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
except:
pass
text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
return text
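Example of the combined number expansion above, covering the plain-integer and decimal paths:

print(expand_numbers_multilingual("track 2 of 12, runtime 3.5 minutes", "en"))
# -> "track two of twelve, runtime three point five minutes"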
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, " ", text)
def multilingual_cleaners(text, lang):
text = text.replace('"', "")
if lang == "tr":
text = text.replace("İ", "i")
text = text.replace("Ö", "ö")
text = text.replace("Ü", "ü")
text = lowercase(text)
try:
text = expand_numbers_multilingual(text, lang)
except:
pass
try:
text = expand_abbreviations_multilingual(text, lang)
except:
pass
try:
text = expand_symbols_multilingual(text, lang=lang)
except:
pass
text = collapse_whitespace(text)
return text
def basic_cleaners(text):
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
return text
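End-to-end example of the full cleaner pipeline above (numbers, then abbreviations, then symbols, then whitespace collapsing):

print(multilingual_cleaners("Dr. Smith waited 15 minutes & 3 seconds", "en"))
# -> "doctor smith waited fifteen minutes and three seconds"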

View File

@@ -11,7 +11,7 @@ class PT5XlModel(sd1_clip.SDClipModel):
class PT5XlTokenizer(sd1_clip.SDTokenizer): class PT5XlTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model") tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
class AuraT5Tokenizer(sd1_clip.SD1Tokenizer): class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):

View File

@@ -93,11 +93,8 @@ class BertEmbeddings(torch.nn.Module):
self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device) self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, input_tokens, embeds=None, token_type_ids=None, dtype=None): def forward(self, input_tokens, token_type_ids=None, dtype=None):
if embeds is not None: x = self.word_embeddings(input_tokens, out_dtype=dtype)
x = embeds
else:
x = self.word_embeddings(input_tokens, out_dtype=dtype)
x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x) x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
if token_type_ids is not None: if token_type_ids is not None:
x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype) x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
@@ -116,8 +113,8 @@ class BertModel_(torch.nn.Module):
self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations) self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations) self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype) x = self.embeddings(input_tokens, dtype=dtype)
mask = None mask = None
if attention_mask is not None: if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])

View File

@@ -22,7 +22,7 @@ class CosmosT5XXL(sd1_clip.SD1ClipModel):
class T5XXLTokenizer(sd1_clip.SDTokenizer): class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=1024, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=1024, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512)
class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer): class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer):

View File

@@ -9,18 +9,19 @@ import os
class T5XXLTokenizer(sd1_clip.SDTokenizer): class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
class FluxTokenizer: class FluxTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {} out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs) out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs) out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
return out return out
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):
@@ -34,7 +35,8 @@ class FluxClipModel(torch.nn.Module):
def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}): def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
super().__init__() super().__init__()
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device) dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options) clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options) self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
self.dtypes = set([dtype, dtype_t5]) self.dtypes = set([dtype, dtype_t5])

View File

@@ -18,7 +18,7 @@ class MochiT5XXL(sd1_clip.SD1ClipModel):
class T5XXLTokenizer(sd1_clip.SDTokenizer): class T5XXLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
class MochiT5Tokenizer(sd1_clip.SD1Tokenizer): class MochiT5Tokenizer(sd1_clip.SD1Tokenizer):

View File

@@ -1,155 +0,0 @@
from . import hunyuan_video
from . import sd3_clip
from comfy import sd1_clip
from comfy import sdxl_clip
import comfy.model_management
import torch
import logging
class HiDreamTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
self.t5xxl = sd3_clip.T5XXLTokenizer(embedding_directory=embedding_directory, min_length=128, max_length=128, tokenizer_data=tokenizer_data)
self.llama = hunyuan_video.LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=128, pad_token=128009, tokenizer_data=tokenizer_data)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
out = {}
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids, **kwargs)
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs)
t5xxl = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs)
out["t5xxl"] = [t5xxl[0]] # Use only first 128 tokens
out["llama"] = self.llama.tokenize_with_weights(text, return_word_ids, **kwargs)
return out
def untokenize(self, token_weight_pair):
return self.clip_g.untokenize(token_weight_pair)
def state_dict(self):
return {}
class HiDreamTEModel(torch.nn.Module):
def __init__(self, clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, device="cpu", dtype=None, model_options={}):
super().__init__()
self.dtypes = set()
if clip_l:
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=True, model_options=model_options)
self.dtypes.add(dtype)
else:
self.clip_l = None
if clip_g:
self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype, model_options=model_options)
self.dtypes.add(dtype)
else:
self.clip_g = None
if t5:
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
self.t5xxl = sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options, attention_mask=True)
self.dtypes.add(dtype_t5)
else:
self.t5xxl = None
if llama:
dtype_llama = comfy.model_management.pick_weight_dtype(dtype_llama, dtype, device)
if "vocab_size" not in model_options:
model_options["vocab_size"] = 128256
self.llama = hunyuan_video.LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options, layer="all", layer_idx=None, special_tokens={"start": 128000, "pad": 128009})
self.dtypes.add(dtype_llama)
else:
self.llama = None
logging.debug("Created HiDream text encoder with: clip_l {}, clip_g {}, t5xxl {}:{}, llama {}:{}".format(clip_l, clip_g, t5, dtype_t5, llama, dtype_llama))
def set_clip_options(self, options):
if self.clip_l is not None:
self.clip_l.set_clip_options(options)
if self.clip_g is not None:
self.clip_g.set_clip_options(options)
if self.t5xxl is not None:
self.t5xxl.set_clip_options(options)
if self.llama is not None:
self.llama.set_clip_options(options)
def reset_clip_options(self):
if self.clip_l is not None:
self.clip_l.reset_clip_options()
if self.clip_g is not None:
self.clip_g.reset_clip_options()
if self.t5xxl is not None:
self.t5xxl.reset_clip_options()
if self.llama is not None:
self.llama.reset_clip_options()
def encode_token_weights(self, token_weight_pairs):
token_weight_pairs_l = token_weight_pairs["l"]
token_weight_pairs_g = token_weight_pairs["g"]
token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
token_weight_pairs_llama = token_weight_pairs["llama"]
lg_out = None
pooled = None
extra = {}
if len(token_weight_pairs_g) > 0 or len(token_weight_pairs_l) > 0:
if self.clip_l is not None:
lg_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
else:
l_pooled = torch.zeros((1, 768), device=comfy.model_management.intermediate_device())
if self.clip_g is not None:
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
else:
g_pooled = torch.zeros((1, 1280), device=comfy.model_management.intermediate_device())
pooled = torch.cat((l_pooled, g_pooled), dim=-1)
if self.t5xxl is not None:
t5_output = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
t5_out, t5_pooled = t5_output[:2]
else:
t5_out = None
if self.llama is not None:
ll_output = self.llama.encode_token_weights(token_weight_pairs_llama)
ll_out, ll_pooled = ll_output[:2]
ll_out = ll_out[:, 1:]
else:
ll_out = None
if t5_out is None:
t5_out = torch.zeros((1, 128, 4096), device=comfy.model_management.intermediate_device())
if ll_out is None:
ll_out = torch.zeros((1, 32, 1, 4096), device=comfy.model_management.intermediate_device())
if pooled is None:
pooled = torch.zeros((1, 768 + 1280), device=comfy.model_management.intermediate_device())
extra["conditioning_llama3"] = ll_out
return t5_out, pooled, extra
def load_sd(self, sd):
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
return self.clip_g.load_sd(sd)
elif "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
return self.clip_l.load_sd(sd)
elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
return self.t5xxl.load_sd(sd)
else:
return self.llama.load_sd(sd)
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None):
class HiDreamTEModel_(HiDreamTEModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
model_options = model_options.copy()
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
model_options = model_options.copy()
model_options["llama_scaled_fp8"] = llama_scaled_fp8
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
return HiDreamTEModel_
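hidream_clip above uses the same closure-factory pattern found throughout these text-encoder modules: bake constructor options into a subclass so loaders can instantiate it with a uniform (device, dtype, model_options) signature. A generic, illustrative sketch of the idea (all names here are hypothetical):

def make_encoder_class(base_cls, **baked_options):
    # Return a subclass with the extra constructor options frozen in.
    class Configured(base_cls):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            super().__init__(device=device, dtype=dtype, model_options=model_options, **baked_options)
    return Configured

class DummyTE:
    def __init__(self, device="cpu", dtype=None, model_options={}, t5=True, llama=True):
        self.t5, self.llama = t5, llama

TextOnlyTE = make_encoder_class(DummyTE, t5=True, llama=False)
print(TextOnlyTE().llama)  # False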

View File

@@ -4,7 +4,6 @@ import comfy.text_encoders.llama
from transformers import LlamaTokenizerFast from transformers import LlamaTokenizerFast
import torch import torch
import os import os
import numbers
def llama_detect(state_dict, prefix=""): def llama_detect(state_dict, prefix=""):
@@ -21,49 +20,33 @@ def llama_detect(state_dict, prefix=""):
class LLAMA3Tokenizer(sd1_clip.SDTokenizer): class LLAMA3Tokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}, min_length=256, pad_token=128258): def __init__(self, embedding_directory=None, tokenizer_data={}, min_length=256):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "llama_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "llama_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=pad_token, min_length=min_length, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, end_token=128009, min_length=min_length)
class LLAMAModel(sd1_clip.SDClipModel): class LLAMAModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}, special_tokens={"start": 128000, "pad": 128258}): def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}):
llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None) llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None)
if llama_scaled_fp8 is not None: if llama_scaled_fp8 is not None:
model_options = model_options.copy() model_options = model_options.copy()
model_options["scaled_fp8"] = llama_scaled_fp8 model_options["scaled_fp8"] = llama_scaled_fp8
textmodel_json_config = {} super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 128000, "pad": 128258}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Llama2, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
vocab_size = model_options.get("vocab_size", None)
if vocab_size is not None:
textmodel_json_config["vocab_size"] = vocab_size
model_options = {**model_options, "model_name": "llama"}
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens=special_tokens, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Llama2, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
class HunyuanVideoTokenizer: class HunyuanVideoTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>""" # 95 tokens self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1, tokenizer_data=tokenizer_data) self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n""" # 95 tokens
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, image_interleave=1, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {} out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs) out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
if llama_template is None: llama_text = "{}{}".format(self.llama_template, text)
llama_text = self.llama_template.format(text) out["llama"] = self.llama.tokenize_with_weights(llama_text, return_word_ids)
else:
llama_text = llama_template.format(text)
llama_text_tokens = self.llama.tokenize_with_weights(llama_text, return_word_ids, **kwargs)
embed_count = 0
for r in llama_text_tokens:
for i in range(len(r)):
if r[i][0] == 128257:
if image_embeds is not None and embed_count < image_embeds.shape[0]:
r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image", "image_interleave": image_interleave},) + r[i][1:]
embed_count += 1
out["llama"] = llama_text_tokens
return out return out
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):
@@ -77,7 +60,8 @@ class HunyuanVideoClipModel(torch.nn.Module):
def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}): def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
super().__init__() super().__init__()
dtype_llama = comfy.model_management.pick_weight_dtype(dtype_llama, dtype, device) dtype_llama = comfy.model_management.pick_weight_dtype(dtype_llama, dtype, device)
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options) clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
self.llama = LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options) self.llama = LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options)
self.dtypes = set([dtype, dtype_llama]) self.dtypes = set([dtype, dtype_llama])
@@ -96,51 +80,20 @@ class HunyuanVideoClipModel(torch.nn.Module):
llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama) llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
template_end = 0 template_end = 0
extra_template_end = 0 for i, v in enumerate(token_weight_pairs_llama[0]):
extra_sizes = 0 if v[0] == 128007: # <|end_header_id|>
user_end = 9999999999999 template_end = i
images = []
tok_pairs = token_weight_pairs_llama[0]
for i, v in enumerate(tok_pairs):
elem = v[0]
if not torch.is_tensor(elem):
if isinstance(elem, numbers.Integral):
if elem == 128006:
if tok_pairs[i + 1][0] == 882:
if tok_pairs[i + 2][0] == 128007:
template_end = i + 2
user_end = -1
if elem == 128009 and user_end == -1:
user_end = i + 1
else:
if elem.get("original_type") == "image":
elem_size = elem.get("data").shape[0]
if template_end > 0:
if user_end == -1:
extra_template_end += elem_size - 1
else:
image_start = i + extra_sizes
image_end = i + elem_size + extra_sizes
images.append((image_start, image_end, elem.get("image_interleave", 1)))
extra_sizes += elem_size - 1
if llama_out.shape[1] > (template_end + 2): if llama_out.shape[1] > (template_end + 2):
if tok_pairs[template_end + 1][0] == 271: if token_weight_pairs_llama[0][template_end + 1][0] == 271:
template_end += 2 template_end += 2
llama_output = llama_out[:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end] llama_out = llama_out[:, template_end:]
llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end] llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end:]
if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]): if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
llama_extra_out.pop("attention_mask") # attention mask is useless if no masked elements llama_extra_out.pop("attention_mask") # attention mask is useless if no masked elements
if len(images) > 0:
out = []
for i in images:
out.append(llama_out[:, i[0]: i[1]: i[2]])
llama_output = torch.cat(out + [llama_output], dim=1)
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l) l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
return llama_output, l_pooled, llama_extra_out return llama_out, l_pooled, llama_extra_out
def load_sd(self, sd): def load_sd(self, sd):
if "text_model.encoder.layers.1.mlp.fc1.weight" in sd: if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:

View File

@@ -9,26 +9,24 @@ import torch
class HyditBertModel(sd1_clip.SDClipModel): class HyditBertModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip.json")
model_options = {**model_options, "model_name": "hydit_clip"}
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True, model_options=model_options) super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class HyditBertTokenizer(sd1_clip.SDTokenizer): class HyditBertTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip_tokenizer") tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77, tokenizer_data=tokenizer_data) super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77)
class MT5XLModel(sd1_clip.SDClipModel): class MT5XLModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_config_xl.json") textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_config_xl.json")
model_options = {**model_options, "model_name": "mt5xl"}
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, return_attention_masks=True, model_options=model_options) super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
class MT5XLTokenizer(sd1_clip.SDTokenizer): class MT5XLTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
#tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_tokenizer"), "spiece.model") #tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_tokenizer"), "spiece.model")
tokenizer = tokenizer_data.get("spiece_model", None) tokenizer = tokenizer_data.get("spiece_model", None)
super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data) super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
def state_dict(self): def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()} return {"spiece_model": self.tokenizer.serialize_model()}
@@ -37,12 +35,12 @@ class HyditTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}): def __init__(self, embedding_directory=None, tokenizer_data={}):
mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None) mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None)
self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory) self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
self.mt5xl = MT5XLTokenizer(tokenizer_data={**tokenizer_data, "spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory) self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {} out = {}
out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids, **kwargs) out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids, **kwargs) out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)
return out return out
def untokenize(self, token_weight_pair): def untokenize(self, token_weight_pair):

View File

@@ -241,11 +241,8 @@ class Llama2_(nn.Module):
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
         # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)

-    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
-        if embeds is not None:
-            x = embeds
-        else:
-            x = self.embed_tokens(x, out_dtype=dtype)
+    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+        x = self.embed_tokens(x, out_dtype=dtype)

         if self.normalize_in:
             x *= self.config.hidden_size ** 0.5
@@ -268,17 +265,11 @@ class Llama2_(nn.Module):
         optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

         intermediate = None
-        all_intermediate = None
         if intermediate_output is not None:
-            if intermediate_output == "all":
-                all_intermediate = []
-                intermediate_output = None
-            elif intermediate_output < 0:
+            if intermediate_output < 0:
                 intermediate_output = len(self.layers) + intermediate_output

         for i, layer in enumerate(self.layers):
-            if all_intermediate is not None:
-                all_intermediate.append(x.unsqueeze(1).clone())
             x = layer(
                 x=x,
                 attention_mask=mask,
@@ -289,12 +280,6 @@ class Llama2_(nn.Module):
                 intermediate = x.clone()

         x = self.norm(x)
-        if all_intermediate is not None:
-            all_intermediate.append(x.unsqueeze(1).clone())
-
-        if all_intermediate is not None:
-            intermediate = torch.cat(all_intermediate, dim=1)

         if intermediate is not None and final_layer_norm_intermediate:
             intermediate = self.norm(intermediate)
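
The `-` lines in these hunks implement an intermediate_output == "all" mode that snapshots the hidden state before every transformer block (and once more after the final norm) and stacks the snapshots along a new dimension. A self-contained sketch of that collection pattern, with toy layers standing in for the real transformer blocks:

# Standalone sketch of the "collect all intermediate layer outputs" pattern shown
# above; nn.Linear layers stand in for the real attention blocks.
import torch
import torch.nn as nn

layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])
x = torch.randn(2, 8, 16)  # (batch, tokens, hidden)

all_intermediate = []
for layer in layers:
    all_intermediate.append(x.unsqueeze(1).clone())  # snapshot before each block
    x = layer(x)
all_intermediate.append(x.unsqueeze(1).clone())      # final snapshot after the stack

stacked = torch.cat(all_intermediate, dim=1)         # (batch, num_layers + 1, tokens, hidden)
print(stacked.shape)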

View File

@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "openai/clip-vit-large-patch14",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 49407,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 248,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.24.0",
+  "vocab_size": 49408
+}
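
The added JSON is a CLIP-L text-encoder config whose one notable departure from the stock openai/clip-vit-large-patch14 settings is max_position_embeddings: 248, the Long-CLIP context length (stock CLIP-L uses 77). A quick sanity check of the file; the path below is illustrative:

# Quick check of the config added above (the path is an assumption about where it lives).
import json

with open("comfy/text_encoders/long_clipl.json") as f:
    cfg = json.load(f)

assert cfg["max_position_embeddings"] == 248  # Long-CLIP context vs. 77 in stock CLIP-L
print(cfg["hidden_size"], cfg["num_hidden_layers"])  # 768 12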

View File

@@ -1,27 +1,30 @@
+from comfy import sd1_clip
+import os
+
+class LongClipTokenizer_(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+
+class LongClipModel_(sd1_clip.SDClipModel):
+    def __init__(self, *args, **kwargs):
+        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "long_clipl.json")
+        super().__init__(*args, textmodel_json_config=textmodel_json_config, **kwargs)
+
+class LongClipTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)
+
+class LongClipModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
+        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)
+
 def model_options_long_clip(sd, tokenizer_data, model_options):
     w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
-    if w is None:
-        w = sd.get("clip_g.text_model.embeddings.position_embedding.weight", None)
-    else:
-        model_name = "clip_g"
     if w is None:
         w = sd.get("text_model.embeddings.position_embedding.weight", None)
-        if w is not None:
-            if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
-                model_name = "clip_g"
-            elif "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
-                model_name = "clip_l"
-    else:
-        model_name = "clip_l"
-    if w is not None:
+    if w is not None and w.shape[0] == 248:
         tokenizer_data = tokenizer_data.copy()
         model_options = model_options.copy()
-        model_config = model_options.get("model_config", {})
-        model_config["max_position_embeddings"] = w.shape[0]
-        model_options["{}_model_config".format(model_name)] = model_config
-        tokenizer_data["{}_max_length".format(model_name)] = w.shape[0]
+        tokenizer_data["clip_l_tokenizer_class"] = LongClipTokenizer_
+        model_options["clip_l_class"] = LongClipModel_
     return tokenizer_data, model_options
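
Both sides of this hunk key off the shape of the text encoder's position-embedding weight: the `+` side only treats a checkpoint as Long-CLIP when clip_l has exactly 248 rows, while the `-` side records whatever row count it finds in the model config. A small sketch of the detection idea on a toy state dict:

# Sketch of the detection idea: the number of position-embedding rows gives the
# text encoder's maximum token length. Toy state dict, not a real checkpoint.
import torch

sd = {"clip_l.text_model.embeddings.position_embedding.weight": torch.zeros(248, 768)}

w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
if w is not None:
    max_length = w.shape[0]  # 248 -> Long-CLIP, 77 -> stock CLIP-L
    print("detected max token length:", max_length)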

View File

@@ -6,7 +6,7 @@ import comfy.text_encoders.genmo
 class T5XXLTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
-        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128, tokenizer_data=tokenizer_data) #pad to 128?
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128) #pad to 128?

 class LTXVT5Tokenizer(sd1_clip.SD1Tokenizer):
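
This hunk, and the sd1_clip-based tokenizer hunks that follow, are the same one-line change: the tokenizer_data dict is no longer passed through to the base tokenizer's __init__. A hedged sketch of why that pass-through matters, with hypothetical class names rather than ComfyUI's SDTokenizer:

# Illustrative sketch only (BaseTokenizer/T5LikeTokenizer are hypothetical):
# threading tokenizer_data through lets per-checkpoint overrides reach the base class.
class BaseTokenizer:
    def __init__(self, max_length=77, tokenizer_data={}):
        # an override stored in tokenizer_data wins over the hard-coded default
        self.max_length = tokenizer_data.get("max_length", max_length)

class T5LikeTokenizer(BaseTokenizer):
    def __init__(self, tokenizer_data={}):
        super().__init__(max_length=99999999, tokenizer_data=tokenizer_data)

print(T5LikeTokenizer(tokenizer_data={"max_length": 512}).max_length)  # 512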

View File

@@ -6,7 +6,7 @@ import comfy.text_encoders.llama
 class Gemma2BTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False})

     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}

View File

@@ -24,7 +24,7 @@ class PixArtT5XXL(sd1_clip.SD1ClipModel):
 class T5XXLTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
-        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1) # no padding

 class PixArtTokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):

View File

@@ -11,7 +11,7 @@ class T5BaseModel(sd1_clip.SDClipModel):
 class T5BaseTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128)

 class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):

View File

@@ -12,7 +12,7 @@ class SD2ClipHModel(sd1_clip.SDClipModel):
 class SD2ClipHTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024, embedding_key='clip_h', tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024)

 class SD2Tokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
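
Among the arguments dropped here is embedding_key='clip_h'. A plausible reading, sketched with a hypothetical loader rather than ComfyUI's actual embedding code: when a textual-inversion file stores one tensor per text encoder, the key picks the tensor meant for this encoder (1024-wide for SD2's CLIP-H):

# Hedged sketch of what an embedding_key selects; pick_embedding and the file
# layout below are assumptions for illustration, not ComfyUI's real loader.
import torch

embedding_file = {"clip_l": torch.zeros(2, 768), "clip_h": torch.zeros(2, 1024)}

def pick_embedding(data, embedding_key, embedding_size):
    w = data.get(embedding_key, None)
    if w is not None and w.shape[-1] == embedding_size:
        return w
    return None

print(pick_embedding(embedding_file, "clip_h", 1024).shape)  # torch.Size([2, 1024])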

Some files were not shown because too many files have changed in this diff.