Allow running with non working

Add model management and database
- use sqlalchemy + alembic + sqlite for db - extract model data and previews - endpoints for db interactions - add tests
2025-03-28 11:46:05 +08:00 · 2025-03-28 11:39:56 +08:00
309 changed files with 273022 additions and 47053 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@@ -63,12 +63,7 @@ except:
 print("checking out master branch")  # noqa: T201
 branch = repo.lookup_branch('master')
 if branch is None:
-    try:
-        ref = repo.lookup_reference('refs/remotes/origin/master')
-    except:
-        print("pulling.")  # noqa: T201
-        pull(repo)
-        ref = repo.lookup_reference('refs/remotes/origin/master')
+    ref = repo.lookup_reference('refs/remotes/origin/master')
    repo.checkout(ref)
    branch = repo.lookup_branch('master')
    if branch is None:
--- a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
-pause
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -12,7 +12,7 @@ on:
        description: 'CUDA version'
        required: true
        type: string
-        default: "128"
+        default: "124"
      python_minor:
        description: 'Python minor version'
        required: true
@@ -22,7 +22,7 @@ on:
        description: 'Python patch version'
        required: true
        type: string
-        default: "10"
+        default: "8"


 jobs:
@@ -36,7 +36,7 @@ jobs:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.git_tag }}
-          fetch-depth: 150
+          fetch-depth: 0
          persist-credentials: false
      - uses: actions/cache/restore@v4
        id: cache
@@ -70,7 +70,7 @@ jobs:
            cd ..

          git clone --depth 1 https://github.com/comfyanonymous/taesd
-          cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+          cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

          mkdir ComfyUI_windows_portable
          mv python_embeded ComfyUI_windows_portable
@@ -85,14 +85,12 @@ jobs:

          cd ..

-          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
          mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z

          cd ComfyUI_windows_portable
          python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu

-          python_embeded/python.exe -s ./update/update.py ComfyUI/
-
          ls

      - name: Upload binaries to release
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
@@ -28,4 +28,4 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -r requirements.txt
+          pip install -r requirements.txt
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@@ -17,7 +17,7 @@ jobs:
        path: "ComfyUI"
    - uses: actions/setup-python@v4
      with:
-        python-version: '3.10'
+        python-version: '3.9'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@@ -18,7 +18,7 @@ jobs:
    - name: Set up Python      
      uses: actions/setup-python@v4
      with:
-        python-version: '3.12'
+        python-version: '3.10'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/update-api-stubs.yml
+++ b/.github/workflows/update-api-stubs.yml
@@ -1,56 +0,0 @@
-name: Generate Pydantic Stubs from api.comfy.org
-
-on:
-  schedule:
-    - cron: '0 0 * * 1'
-  workflow_dispatch:
-
-jobs:
-  generate-models:
-    runs-on: ubuntu-latest
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install 'datamodel-code-generator[http]'
-          npm install @redocly/cli
-      
-      - name: Download OpenAPI spec
-        run: |
-          curl -o openapi.yaml https://api.comfy.org/openapi
-      
-      - name: Filter OpenAPI spec with Redocly
-        run: |
-          npx @redocly/cli bundle openapi.yaml --output filtered-openapi.yaml --config comfy_api_nodes/redocly.yaml --remove-unused-components
-      
-      - name: Generate API models
-        run: |
-          datamodel-codegen --use-subclass-enum --input filtered-openapi.yaml --output comfy_api_nodes/apis --output-model-type pydantic_v2.BaseModel
-      
-      - name: Check for changes
-        id: git-check
-        run: |
-          git diff --exit-code comfy_api_nodes/apis || echo "changes=true" >> $GITHUB_OUTPUT
-      
-      - name: Create Pull Request
-        if: steps.git-check.outputs.changes == 'true'
-        uses: peter-evans/create-pull-request@v5
-        with:
-          commit-message: 'chore: update API models from OpenAPI spec'
-          title: 'Update API models from api.comfy.org'
-          body: |
-            This PR updates the API models based on the latest api.comfy.org OpenAPI specification.
-            
-            Generated automatically by the a Github workflow.
-          branch: update-api-stubs
-          delete-branch: true
-          base: master
--- a/.github/workflows/update-frontend.yml
+++ b/.github/workflows/update-frontend.yml
@@ -0,0 +1,58 @@
+name: Update Frontend Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Frontend version to update to (e.g., 1.0.0)"
+        required: true
+        type: string
+
+jobs:
+  update-frontend:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout ComfyUI
+        uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install requirements
+        run: |
+          python -m pip install --upgrade pip
+          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install -r requirements.txt
+          pip install wait-for-it
+      # Frontend asset will be downloaded to ComfyUI/web_custom_versions/Comfy-Org_ComfyUI_frontend/{version}
+      - name: Start ComfyUI server
+        run: |
+          python main.py --cpu --front-end-version Comfy-Org/ComfyUI_frontend@${{ github.event.inputs.version }} 2>&1 | tee console_output.log &
+          wait-for-it --service 127.0.0.1:8188 -t 30
+      - name: Configure Git
+        run: |
+          git config --global user.name "GitHub Action"
+          git config --global user.email "action@github.com"
+      # Replace existing frontend content with the new version and remove .js.map files
+      # See https://github.com/Comfy-Org/ComfyUI_frontend/issues/2145 for why we remove .js.map files
+      - name: Update frontend content
+        run: |
+          rm -rf web/
+          cp -r web_custom_versions/Comfy-Org_ComfyUI_frontend/${{ github.event.inputs.version }} web/
+          find web/ -type f -name '*.js.map' -delete  # any depth (bash globstar is off, so ** only matched one level); no failure when nothing matches
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.PR_BOT_PAT }}
+          commit-message: "Update frontend to v${{ github.event.inputs.version }}"
+          title: "Frontend Update: v${{ github.event.inputs.version }}"
+          body: |
+            Automated PR to update frontend content to version ${{ github.event.inputs.version }}
+
+            This PR was created automatically by the frontend update workflow.
+          branch: release-${{ github.event.inputs.version }}
+          base: master
+          labels: Frontend,dependencies
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@@ -17,7 +17,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "124"

      python_minor:
        description: 'python minor version'
@@ -29,7 +29,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "10"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "126"

      python_minor:
        description: 'python minor version'
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "2"
+        default: "1"
 #  push:
 #    branches:
 #      - master
@@ -34,7 +34,7 @@ jobs:
    steps:
        - uses: actions/checkout@v4
          with:
-            fetch-depth: 30
+            fetch-depth: 0
            persist-credentials: false
        - uses: actions/setup-python@v5
          with:
@@ -56,7 +56,7 @@ jobs:
            cd ..

            git clone --depth 1 https://github.com/comfyanonymous/taesd
-            cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable_nightly_pytorch
            mv python_embeded ComfyUI_windows_portable_nightly_pytorch
@@ -74,7 +74,7 @@ jobs:
            pause" > ./update/update_comfyui_and_python_dependencies.bat
            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
            mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z

            cd ComfyUI_windows_portable_nightly_pytorch
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "124"

      python_minor:
        description: 'python minor version'
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "10"
+        default: "8"
 #  push:
 #    branches:
 #      - master
@@ -50,7 +50,7 @@ jobs:

        - uses: actions/checkout@v4
          with:
-            fetch-depth: 150
+            fetch-depth: 0
            persist-credentials: false
        - shell: bash
          run: |
@@ -67,7 +67,7 @@ jobs:
            cd ..

            git clone --depth 1 https://github.com/comfyanonymous/taesd
-            cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/
+            cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/

            mkdir ComfyUI_windows_portable
            mv python_embeded ComfyUI_windows_portable
@@ -82,14 +82,12 @@ jobs:

            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
            mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z

            cd ComfyUI_windows_portable
            python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu

-            python_embeded/python.exe -s ./update/update.py ComfyUI/
-
            ls

        - name: Upload binaries to release
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,3 @@ venv/
 *.log
 web_custom_versions/
 .DS_Store
-openapi.yaml
-filtered-openapi.yaml
-uv.lock
--- a/27
+++ b/27
@@ -5,20 +5,19 @@
 # Inlined the team members for now.

 # Maintainers
-*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
+*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink

 # Python web server
-/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
+/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata

-# Node developers
-/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
-/comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
+# Frontend assets
+/web/ @huchenlei @webfiltered @pythongosssss @yoland68 @robinjhuang
+
+# Extra nodes
+/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 <div align="center">

 # ComfyUI
-**The most powerful and modular visual AI engine and application.**
+**The most powerful and modular diffusion model GUI and backend.**


 [![Website][website-shield]][website-url]
@@ -31,23 +31,10 @@
 ![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
 </div>

-ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
+This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and to see what ComfyUI can do, check out:
+### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)

-## Get Started
-
-#### [Desktop Application](https://www.comfy.org/download)
- The easiest way to get started. 
- Available on Windows & macOS.
-
-#### [Windows Portable Package](#installing)
- Get the latest commits and completely portable.
- Available on Windows.
-
-#### [Manual Install](#manual-install-windows-linux)
-Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
-
-## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
-See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
+### [Installing ComfyUI](#installing)

 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
@@ -60,20 +47,12 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
   - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
   - [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
   - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
-   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
-   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
 - Video Models
   - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
   - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
   - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-   - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
-   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
- Audio Models
-   - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
-   - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- 3D Models
-   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
+- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -100,23 +79,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith

 Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)

-## Release Process
-
-ComfyUI follows a weekly release cycle every Friday, with three interconnected repositories:
-
-1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new stable version (e.g., v0.7.0)
-   - Serves as the foundation for the desktop release
-
-2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
-   - Builds a new release using the latest stable core version
-   - Version numbers match the core release (e.g., Desktop v1.7.0 uses Core v1.7.0)
-
-3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
-   - Weekly frontend updates are merged into the core repository
-   - Features are frozen for the upcoming core release
-   - Development continues for the next release cycle
-
 ## Shortcuts

 | Keybind                            | Explanation                                                                                                        |
@@ -157,7 +119,7 @@ ComfyUI follows a weekly release cycle every Friday, with three interconnected r

 # Installing

-## Windows Portable
+## Windows

 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

@@ -175,18 +137,9 @@ See the [Config file](extra_model_paths.yaml.example) to set the search paths fo

 To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)

-
-## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
-
-You can install and start ComfyUI using comfy-cli:
-```bash
-pip install comfy-cli
-comfy install
-```
-
 ## Manual Install (Windows, Linux)

-python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet.
+Note that some dependencies do not yet support python 3.13 so using 3.12 is recommended.

 Git clone this repo.

@@ -198,11 +151,11 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

-This is the command to install the nightly with ROCm 6.3 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.2.4 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2.4```

 ### Intel GPUs (Windows and Linux)

@@ -232,11 +185,11 @@ Additional discussion and help can be found [here](https://github.com/comfyanony

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124```

-This is the command to install pytorch nightly instead which might have performance improvements.
+This is the command to install pytorch nightly instead which might have performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126```

 #### Troubleshooting

@@ -280,13 +233,6 @@ For models compatible with Ascend Extension for PyTorch (torch_npu). To get star
 3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
 4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.

-#### Cambricon MLUs
-
-For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
-
-1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html)
-2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
-3. Launch ComfyUI by running `python main.py`

 # Running

@@ -343,8 +289,6 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w

 ## Support and dev channel

-[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
-
 [Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).

 See also: [https://www.comfy.org/](https://www.comfy.org/)
@@ -361,7 +305,7 @@ For any bugs, issues, or feature requests related to the frontend, please use th

 The new frontend is now the default for ComfyUI. However, please note:

-1. The frontend in the main ComfyUI repository is updated fortnightly.
+1. The frontend in the main ComfyUI repository is updated weekly.
 2. Daily releases are available in the separate frontend repository.

 To use the most up-to-date frontend version:
@@ -378,7 +322,7 @@ To use the most up-to-date frontend version:
   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
   ```

-This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
+This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.

 ### Accessing the Legacy Frontend

--- a/alembic.ini
+++ b/alembic.ini
@@ -0,0 +1,119 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+# Use forward slashes (/) also on windows to provide an os agnostic path
+script_location = alembic_db
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
+# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic_db/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic_db/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+# version_path_separator = newline
+#
+# Use os.pathsep. Default configuration used for new projects.
+version_path_separator = os
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = sqlite:///user/comfyui.db
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = check --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARNING
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARNING
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
--- a/alembic_db/README.md
+++ b/alembic_db/README.md
@@ -0,0 +1,3 @@
+## Generate new revision
+1. Update models in `app/database/models.py` (path relative to the repository root)
+2. Run `alembic revision --autogenerate -m "{your message}"`
--- a/alembic_db/env.py
+++ b/alembic_db/env.py
@@ -0,0 +1,75 @@
+from logging.config import fileConfig
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+from app.database.models import Base
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")  # read from the "sqlalchemy.url" option of the ini file in use
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,  # Base.metadata from app.database.models
+        literal_binds=True,  # render parameter values inline, since no connection exists to bind them
+        dialect_opts={"paramstyle": "named"},
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    connectable = engine_from_config(
+        config.get_section(config.config_ini_section, {}),  # options of the main ini section; {} if the section is missing
+        prefix="sqlalchemy.",  # only options starting with "sqlalchemy." configure the engine
+        poolclass=pool.NullPool,  # no connection pooling for this engine
+    )
+
+    with connectable.connect() as connection:  # connection is closed when the block exits
+        context.configure(
+            connection=connection, target_metadata=target_metadata
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+if context.is_offline_mode():  # module-level entry point: executed when this file is loaded, dispatching on the mode the context reports
+    run_migrations_offline()
+else:
+    run_migrations_online()
--- a/alembic_db/script.py.mako
+++ b/alembic_db/script.py.mako
@@ -0,0 +1,28 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    ${downgrades if downgrades else "pass"}
--- a/alembic_db/versions/2fb22c4fff36_init.py
+++ b/alembic_db/versions/2fb22c4fff36_init.py
@@ -0,0 +1,58 @@
+"""init
+
+Revision ID: 2fb22c4fff36
+Revises: 
+Create Date: 2025-03-27 19:00:47.686079
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '2fb22c4fff36'
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create the initial ``model``, ``tag`` and ``model_tag`` tables."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        'model',
+        sa.Column('type', sa.Text(), nullable=False),
+        sa.Column('path', sa.Text(), nullable=False),
+        sa.Column('title', sa.Text(), nullable=True),
+        sa.Column('description', sa.Text(), nullable=True),
+        sa.Column('architecture', sa.Text(), nullable=True),
+        sa.Column('hash', sa.Text(), nullable=True),
+        sa.Column('source_url', sa.Text(), nullable=True),
+        sa.Column(
+            'date_added',
+            sa.DateTime(),
+            server_default=sa.text('(CURRENT_TIMESTAMP)'),
+            nullable=True,
+        ),
+        sa.PrimaryKeyConstraint('type', 'path'),
+    )
+    op.create_table(
+        'tag',
+        sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
+        sa.Column('name', sa.Text(), nullable=False),
+        sa.PrimaryKeyConstraint('id'),
+        sa.UniqueConstraint('name'),
+    )
+    # Association table: its foreign keys point at both tables created above.
+    op.create_table(
+        'model_tag',
+        sa.Column('model_type', sa.Text(), nullable=False),
+        sa.Column('model_path', sa.Text(), nullable=False),
+        sa.Column('tag_id', sa.Integer(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ['model_type', 'model_path'],
+            ['model.type', 'model.path'],
+            ondelete='CASCADE',
+        ),
+        sa.ForeignKeyConstraint(['tag_id'], ['tag.id'], ondelete='CASCADE'),
+        sa.PrimaryKeyConstraint('model_type', 'model_path', 'tag_id'),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Drop the tables of this revision: ``model_tag``, ``tag``, then ``model``."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    for table_name in ('model_tag', 'tag', 'model'):
+        op.drop_table(table_name)
+    # ### end Alembic commands ###
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@@ -1,9 +1,9 @@
 from aiohttp import web
 from typing import Optional
-from folder_paths import folder_names_and_paths, get_directory_by_type
+from folder_paths import models_dir, user_directory, output_directory, folder_names_and_paths
+from api_server.services.file_service import FileService
 from api_server.services.terminal_service import TerminalService
 import app.logger
-import os

 class InternalRoutes:
    '''
@@ -15,10 +15,26 @@ class InternalRoutes:
    def __init__(self, prompt_server):
        self.routes: web.RouteTableDef = web.RouteTableDef()
        self._app: Optional[web.Application] = None
+        self.file_service = FileService({
+            "models": models_dir,
+            "user": user_directory,
+            "output": output_directory
+        })
        self.prompt_server = prompt_server
        self.terminal_service = TerminalService(prompt_server)

    def setup_routes(self):
+        @self.routes.get('/files')
+        async def list_files(request):
+            """Return the file listing for the directory named by the ``directory`` query parameter."""
+            requested_key = request.query.get('directory', '')
+            try:
+                return web.json_response(
+                    {"files": self.file_service.list_files(requested_key)}
+                )
+            except ValueError as err:
+                # ValueError is reported as a client error (400).
+                return web.json_response({"error": str(err)}, status=400)
+            except Exception as err:
+                # Anything else is reported as a server error (500).
+                return web.json_response({"error": str(err)}, status=500)
+
        @self.routes.get('/logs')
        async def get_logs(request):
            return web.json_response("".join([(l["t"] + " - " + l["m"]) for l in app.logger.get_logs()]))
@@ -51,20 +67,6 @@ class InternalRoutes:
                response[key] = folder_names_and_paths[key][0]
            return web.json_response(response)

-        @self.routes.get('/files/{directory_type}')
-        async def get_files(request: web.Request) -> web.Response:
-            directory_type = request.match_info['directory_type']
-            if directory_type not in ("output", "input", "temp"):
-                return web.json_response({"error": "Invalid directory type"}, status=400)
-
-            directory = get_directory_by_type(directory_type)
-            sorted_files = sorted(
-                (entry for entry in os.scandir(directory) if entry.is_file()),
-                key=lambda entry: -entry.stat().st_mtime
-            )
-            return web.json_response([entry.name for entry in sorted_files], status=200)
-
-
    def get_app(self):
        if self._app is None:
            self._app = web.Application()
--- a/api_server/services/file_service.py
+++ b/api_server/services/file_service.py
@@ -0,0 +1,13 @@
+from typing import Dict, List, Optional
+from api_server.utils.file_operations import FileSystemOperations, FileSystemItem
+
+class FileService:
+    """Lists files under a fixed mapping of directory keys to paths."""
+
+    def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
+        # Use the injected operations object, or a default one when none is given.
+        self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()
+        self.allowed_directories: Dict[str, str] = allowed_directories
+
+    def list_files(self, directory_key: str) -> List[FileSystemItem]:
+        """Walk the directory registered under ``directory_key``.
+
+        Raises:
+            ValueError: if ``directory_key`` is not one of the allowed keys.
+        """
+        allowed = self.allowed_directories
+        if directory_key in allowed:
+            return self.file_system_ops.walk_directory(allowed[directory_key])
+        raise ValueError("Invalid directory key")
--- a/app/app_settings.py
+++ b/app/app_settings.py
@@ -9,14 +9,8 @@ class AppSettings():
        self.user_manager = user_manager

    def get_settings(self, request):
-        try:
-            file = self.user_manager.get_request_user_filepath(
-                request,
-                "comfy.settings.json"
-            )
-        except KeyError as e:
-            logging.error("User settings not found.")
-            raise web.HTTPUnauthorized() from e
+        file = self.user_manager.get_request_user_filepath(
+            request, "comfy.settings.json")
        if os.path.isfile(file):
            try:
                with open(file) as f:
--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@@ -4,142 +4,31 @@ import os
 import folder_paths
 import glob
 from aiohttp import web
-import json
-import logging
-from functools import lru_cache
-
-from utils.json_util import merge_json_recursive
-
-
-# Extra locale files to load into main.json
-EXTRA_LOCALE_FILES = [
-    "nodeDefs.json",
-    "commands.json",
-    "settings.json",
-]
-
-
-def safe_load_json_file(file_path: str) -> dict:
-    if not os.path.exists(file_path):
-        return {}
-
-    try:
-        with open(file_path, "r", encoding="utf-8") as f:
-            return json.load(f)
-    except json.JSONDecodeError:
-        logging.error(f"Error loading {file_path}")
-        return {}
-

 class CustomNodeManager:
-    @lru_cache(maxsize=1)
-    def build_translations(self):
-        """Load all custom nodes translations during initialization. Translations are
-        expected to be loaded from `locales/` folder.
-
-        The folder structure is expected to be the following:
-        - custom_nodes/
-            - custom_node_1/
-                - locales/
-                    - en/
-                        - main.json
-                        - commands.json
-                        - settings.json
-
-        returned translations are expected to be in the following format:
-        {
-            "en": {
-                "nodeDefs": {...},
-                "commands": {...},
-                "settings": {...},
-                ...{other main.json keys}
-            }
-        }
-        """
-
-        translations = {}
-
-        for folder in folder_paths.get_folder_paths("custom_nodes"):
-            # Sort glob results for deterministic ordering
-            for custom_node_dir in sorted(glob.glob(os.path.join(folder, "*/"))):
-                locales_dir = os.path.join(custom_node_dir, "locales")
-                if not os.path.exists(locales_dir):
-                    continue
-
-                for lang_dir in glob.glob(os.path.join(locales_dir, "*/")):
-                    lang_code = os.path.basename(os.path.dirname(lang_dir))
-
-                    if lang_code not in translations:
-                        translations[lang_code] = {}
-
-                    # Load main.json
-                    main_file = os.path.join(lang_dir, "main.json")
-                    node_translations = safe_load_json_file(main_file)
-
-                    # Load extra locale files
-                    for extra_file in EXTRA_LOCALE_FILES:
-                        extra_file_path = os.path.join(lang_dir, extra_file)
-                        key = extra_file.split(".")[0]
-                        json_data = safe_load_json_file(extra_file_path)
-                        if json_data:
-                            node_translations[key] = json_data
-
-                    if node_translations:
-                        translations[lang_code] = merge_json_recursive(
-                            translations[lang_code], node_translations
-                        )
-
-        return translations
-
+    """
+    Placeholder to refactor the custom node management features from ComfyUI-Manager.
+    Currently it only contains the custom workflow templates feature.
+    """
    def add_routes(self, routes, webapp, loadedModules):

-        example_workflow_folder_names = ["example_workflows", "example", "examples", "workflow", "workflows"]
-
        @routes.get("/workflow_templates")
        async def get_workflow_templates(request):
            """Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
-
-            files = []
-
-            for folder in folder_paths.get_folder_paths("custom_nodes"):
-                for folder_name in example_workflow_folder_names:
-                    pattern = os.path.join(folder, f"*/{folder_name}/*.json")
-                    matched_files = glob.glob(pattern)
-                    files.extend(matched_files)
-
-            workflow_templates_dict = (
-                {}
-            )  # custom_nodes folder name -> example workflow names
+            files = [
+                file
+                for folder in folder_paths.get_folder_paths("custom_nodes")
+                for file in glob.glob(os.path.join(folder, '*/example_workflows/*.json'))
+            ]
+            workflow_templates_dict = {} # custom_nodes folder name -> example workflow names
            for file in files:
-                custom_nodes_name = os.path.basename(
-                    os.path.dirname(os.path.dirname(file))
-                )
+                custom_nodes_name = os.path.basename(os.path.dirname(os.path.dirname(file)))
                workflow_name = os.path.splitext(os.path.basename(file))[0]
-                workflow_templates_dict.setdefault(custom_nodes_name, []).append(
-                    workflow_name
-                )
+                workflow_templates_dict.setdefault(custom_nodes_name, []).append(workflow_name)
            return web.json_response(workflow_templates_dict)

        # Serve workflow templates from custom nodes.
        for module_name, module_dir in loadedModules:
-            for folder_name in example_workflow_folder_names:
-                workflows_dir = os.path.join(module_dir, folder_name)
-
-                if os.path.exists(workflows_dir):
-                    if folder_name != "example_workflows":
-                        logging.debug(
-                            "Found example workflow folder '%s' for custom node '%s', consider renaming it to 'example_workflows'",
-                            folder_name, module_name)
-
-                    webapp.add_routes(
-                        [
-                            web.static(
-                                "/api/workflow_templates/" + module_name, workflows_dir
-                            )
-                        ]
-                    )
-
-        @routes.get("/i18n")
-        async def get_i18n(request):
-            """Returns translations from all custom nodes' locales folders."""
-            return web.json_response(self.build_translations())
+            workflows_dir = os.path.join(module_dir, 'example_workflows')
+            if os.path.exists(workflows_dir):
+                webapp.add_routes([web.static('/api/workflow_templates/' + module_name, workflows_dir)])
--- a/app/database/db.py
+++ b/app/database/db.py
@@ -0,0 +1,118 @@
+import logging
+import os
+import shutil
+import sys
+from comfy.cli_args import args
+
+# Probe the third-party database packages before importing anything that needs
+# them. app.database.models imports sqlalchemy at module level, so it must come
+# after this guard; otherwise a missing package surfaces as a raw ImportError
+# and the install hint below is never shown.
+try:
+    import alembic
+    import sqlalchemy
+except ImportError as e:
+    req_path = os.path.abspath(
+        os.path.join(os.path.dirname(__file__), "../..", "requirements.txt")
+    )
+    logging.error(
+        f"\n\n********** ERROR ***********\n\nRequirements are not installed ({e}). Please install the requirements.txt file by running:\n{sys.executable} -s -m pip install -r {req_path}\n\nIf you are on the portable package you can run: update\\update_comfyui.bat to solve this problem\n********** ERROR **********\n"
+    )
+    # sys.exit is always available; the bare exit() helper is only provided by
+    # the site module.
+    sys.exit(-1)
+
+from alembic import command
+from alembic.config import Config
+from alembic.runtime.migration import MigrationContext
+from alembic.script import ScriptDirectory
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from app.database.models import Tag
+
+Session = None
+
+
+def get_alembic_config():
+    """Build the Alembic ``Config`` used to migrate the application database.
+
+    ``alembic.ini`` and the ``alembic_db`` script directory are resolved two
+    directory levels above this module, and the database URL is taken from
+    ``args.database_url``.
+
+    Returns:
+        The configured ``alembic.config.Config`` instance.
+    """
+    root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+    config_path = os.path.join(root_path, "alembic.ini")
+    scripts_path = os.path.join(root_path, "alembic_db")
+
+    config = Config(config_path)
+    # Option values pass through ConfigParser interpolation, so a literal "%"
+    # (e.g. in a URL-encoded password or a directory name) must be doubled or
+    # set_main_option raises. Reading the option back yields the original text.
+    config.set_main_option("script_location", scripts_path.replace("%", "%%"))
+    config.set_main_option("sqlalchemy.url", args.database_url.replace("%", "%%"))
+
+    return config
+
+
+def get_db_path():
+    """Return the path portion of the configured ``sqlite:///`` database URL.
+
+    Returns:
+        Everything in ``args.database_url`` after the ``sqlite:///`` prefix.
+
+    Raises:
+        ValueError: if the URL does not start with ``sqlite:///``.
+    """
+    url = args.database_url
+    prefix = "sqlite:///"
+    if not url.startswith(prefix):
+        raise ValueError(f"Unsupported database URL '{url}'.")
+    # Strip only the leading scheme. Splitting on "///" and taking the second
+    # piece would truncate a path that itself contains that sequence.
+    return url[len(prefix):]
+
+
+def init_db():
+    """Migrate the database to the latest revision and set up the session factory.
+
+    Steps:
+      1. Read the current revision from the database and the head revision
+         from the migration scripts.
+      2. If they differ, copy the database file to ``<path>.bkp``, run the
+         upgrade, and restore the copy if the upgrade raises.
+      3. Bind the module-level ``Session`` factory to the engine.
+      4. If the database had no revision, populate models and default tags.
+
+    Raises:
+        ValueError: from ``get_db_path`` when an upgrade is needed and the URL
+            is not a ``sqlite:///`` URL.
+        Exception: whatever the Alembic upgrade raised, after the backup has
+            been restored.
+    """
+    db_url = args.database_url
+    logging.debug(f"Database URL: {db_url}")
+
+    config = get_alembic_config()
+
+    # Check if we need to upgrade. The probing connection is released as soon
+    # as the revision is known, so it is not held open during the file copy
+    # and migration below (it was previously never closed).
+    engine = create_engine(db_url)
+    with engine.connect() as conn:
+        context = MigrationContext.configure(conn)
+        current_rev = context.get_current_revision()
+
+    script = ScriptDirectory.from_config(config)
+    target_rev = script.get_current_head()
+
+    if current_rev != target_rev:
+        # Backup the database pre upgrade
+        db_path = get_db_path()
+        backup_path = db_path + ".bkp"
+        if os.path.exists(db_path):
+            shutil.copy(db_path, backup_path)
+        else:
+            backup_path = None
+
+        try:
+            command.upgrade(config, target_rev)
+            logging.info(f"Database upgraded from {current_rev} to {target_rev}")
+        except Exception as e:
+            if backup_path:
+                # Restore the database from backup if upgrade fails
+                shutil.copy(backup_path, db_path)
+                os.remove(backup_path)
+            logging.error(f"Error upgrading database: {e}")
+            # Bare raise keeps the original traceback intact.
+            raise
+
+    global Session
+    Session = sessionmaker(bind=engine)
+
+    if not current_rev:
+        # Init db, populate models
+        # Imported here rather than at module level, as in the original code.
+        from app.model_processor import model_processor
+
+        # populate tags
+        tags = (
+            "character",
+            "style",
+            "concept",
+            "clothing",
+            "pose",
+            "background",
+            "vehicle",
+            "object",
+            "animal",
+            "action",
+        )
+
+        # The context manager closes the session even if seeding fails.
+        with create_session() as session:
+            model_processor.populate_models(session)
+
+            for tag in tags:
+                session.add(Tag(name=tag))
+
+            session.commit()
+
+def can_create_session():
+    """Report whether the module-level ``Session`` factory has been set."""
+    factory_ready = Session is not None
+    return factory_ready
+
+def create_session():
+    """Return a new session from the module-level ``Session`` factory."""
+    session_factory = Session
+    return session_factory()
--- a/app/database/models.py
+++ b/app/database/models.py
@@ -0,0 +1,76 @@
+from sqlalchemy import (
+    Column,
+    Integer,
+    Text,
+    DateTime,
+    Table,
+    ForeignKeyConstraint,
+)
+from sqlalchemy.orm import relationship, declarative_base
+from sqlalchemy.sql import func
+
+Base = declarative_base()
+
+
+def to_dict(obj):
+    """Serialize the table columns of a mapped object into a plain dict.
+
+    Args:
+        obj: an object exposing ``__table__.columns`` and one attribute per
+            column key.
+
+    Returns:
+        A dict of column key to value. Columns whose value is ``None`` are
+        omitted; values that provide their own ``to_dict`` are serialized
+        through it.
+    """
+    result = {}
+    for field in obj.__table__.columns.keys():
+        val = getattr(obj, field)
+        # Skip only unset columns. A truthiness test would also drop real data
+        # such as 0 or an empty string.
+        if val is None:
+            continue
+        result[field] = val.to_dict() if hasattr(val, "to_dict") else val
+    return result
+
+
+# Association table linking a model (composite key: type + path) to a tag.
+ModelTag = Table(
+    "model_tag",
+    Base.metadata,
+    Column("model_type", Text, primary_key=True),
+    Column("model_path", Text, primary_key=True),
+    Column("tag_id", Integer, primary_key=True),
+    ForeignKeyConstraint(
+        ["model_type", "model_path"],
+        ["model.type", "model.path"],
+        ondelete="CASCADE",
+    ),
+    ForeignKeyConstraint(
+        ["tag_id"],
+        ["tag.id"],
+        ondelete="CASCADE",
+    ),
+)
+
+
+class Model(Base):
+    """ORM mapping for the ``model`` table, keyed by ``(type, path)``."""
+
+    __tablename__ = "model"
+
+    # Composite primary key
+    type = Column(Text, primary_key=True)
+    path = Column(Text, primary_key=True)
+
+    # Optional descriptive columns
+    title = Column(Text)
+    description = Column(Text)
+    architecture = Column(Text)
+    hash = Column(Text)
+    source_url = Column(Text)
+    date_added = Column(DateTime, server_default=func.now())
+
+    # Many-to-many link to Tag through the model_tag association table
+    tags = relationship("Tag", secondary=ModelTag, back_populates="models")
+
+    def to_dict(self):
+        """Return the column values plus a ``tags`` list of serialized tags."""
+        return {
+            **to_dict(self),
+            "tags": [tag.to_dict() for tag in self.tags],
+        }
+
+
+class Tag(Base):
+    """ORM mapping for the ``tag`` table; tag names are unique."""
+
+    __tablename__ = "tag"
+
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(Text, nullable=False, unique=True)
+
+    # Many-to-many link to Model through the model_tag association table
+    models = relationship("Model", secondary=ModelTag, back_populates="tags")
+
+    def to_dict(self):
+        """Return this tag's column values as a plain dict."""
+        serialized = to_dict(self)
+        return serialized
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -3,69 +3,16 @@ import argparse
 import logging
 import os
 import re
-import sys
 import tempfile
 import zipfile
-import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import TypedDict, Optional
-from importlib.metadata import version

 import requests
 from typing_extensions import NotRequired
-
 from comfy.cli_args import DEFAULT_VERSION_STRING
-import app.logger
-
-# The path to the requirements.txt file
-req_path = Path(__file__).parents[1] / "requirements.txt"
-
-
-def frontend_install_warning_message():
-    """The warning message to display when the frontend version is not up to date."""
-
-    extra = ""
-    if sys.flags.no_user_site:
-        extra = "-s "
-    return f"""
-Please install the updated requirements.txt file by running:
-{sys.executable} {extra}-m pip install -r {req_path}
-
-This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
-
-If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem
-""".strip()
-
-
-def check_frontend_version():
-    """Check if the frontend version is up to date."""
-
-    def parse_version(version: str) -> tuple[int, int, int]:
-        return tuple(map(int, version.split(".")))
-
-    try:
-        frontend_version_str = version("comfyui-frontend-package")
-        frontend_version = parse_version(frontend_version_str)
-        with open(req_path, "r", encoding="utf-8") as f:
-            required_frontend = parse_version(f.readline().split("=")[-1])
-        if frontend_version < required_frontend:
-            app.logger.log_startup_warning(
-                f"""
-________________________________________________________________________
-WARNING WARNING WARNING WARNING WARNING
-
-Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
-
-{frontend_install_warning_message()}
-________________________________________________________________________
-""".strip()
-            )
-        else:
-            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
-    except Exception as e:
-        logging.error(f"Failed to check frontend version: {e}")


 REQUEST_TIMEOUT = 10  # seconds
@@ -162,49 +109,9 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:


 class FrontendManager:
+    DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")

-    @classmethod
-    def default_frontend_path(cls) -> str:
-        try:
-            import comfyui_frontend_package
-
-            return str(importlib.resources.files(comfyui_frontend_package) / "static")
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-frontend-package is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-            sys.exit(-1)
-
-    @classmethod
-    def templates_path(cls) -> str:
-        try:
-            import comfyui_workflow_templates
-
-            return str(
-                importlib.resources.files(comfyui_workflow_templates) / "templates"
-            )
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-workflow-templates is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-
    @classmethod
    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
        """
@@ -225,9 +132,7 @@ comfyui-workflow-templates is not installed.
        return match_result.group(1), match_result.group(2), match_result.group(3)

    @classmethod
-    def init_frontend_unsafe(
-        cls, version_string: str, provider: Optional[FrontEndProvider] = None
-    ) -> str:
+    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
        """
        Initializes the frontend for the specified version.

@@ -243,26 +148,17 @@ comfyui-workflow-templates is not installed.
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH

        repo_owner, repo_name, version = cls.parse_version_string(version_string)

        if version.startswith("v"):
-            expected_path = str(
-                Path(cls.CUSTOM_FRONTENDS_ROOT)
-                / f"{repo_owner}_{repo_name}"
-                / version.lstrip("v")
-            )
+            expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
            if os.path.exists(expected_path):
-                logging.info(
-                    f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}"
-                )
+                logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
                return expected_path

-        logging.info(
-            f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub..."
-        )
+        logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")

        provider = provider or FrontEndProvider(repo_owner, repo_name)
        release = provider.get_release(version)
@@ -305,5 +201,4 @@ comfyui-workflow-templates is not installed.
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH
--- a/app/logger.py
+++ b/app/logger.py
@@ -82,17 +82,3 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
        logger.addHandler(stdout_handler)

    logger.addHandler(stream_handler)
-
-
-STARTUP_WARNINGS = []
-
-
-def log_startup_warning(msg):
-    logging.warning(msg)
-    STARTUP_WARNINGS.append(msg)
-
-
-def print_startup_warnings():
-    for s in STARTUP_WARNINGS:
-        logging.warning(s)
-    STARTUP_WARNINGS.clear()
--- a/app/model_manager.py
+++ b/app/model_manager.py
@@ -1,19 +1,30 @@
 from __future__ import annotations

 import os
-import base64
-import json
 import time
 import logging
+from app.database.db import create_session
 import folder_paths
-import glob
-import comfy.utils
 from aiohttp import web
 from PIL import Image
 from io import BytesIO
-from folder_paths import map_legacy, filter_files_extensions, filter_files_content_types
+from folder_paths import map_legacy, filter_files_extensions, get_full_path
+from app.database.models import Tag, Model
+from app.model_processor import get_model_previews, model_processor
+from utils.web import dumps
+from sqlalchemy.orm import joinedload
+import sqlalchemy.exc


+def bad_request(message: str):
+    """Build a 400 JSON response carrying ``message`` under the ``error`` key."""
+    payload = {"error": message}
+    return web.json_response(payload, status=400)
+
+def missing_field(field: str):
+    """Build the 400 response used when the required ``field`` was not supplied."""
+    reason = f"{field} is required"
+    return bad_request(reason)
+
+def not_found(message: str):
+    """Build a 404 JSON response whose error text is ``message`` followed by " not found"."""
+    detail = message + " not found"
+    return web.json_response({"error": detail}, status=404)
+
 class ModelFileManager:
    def __init__(self) -> None:
        self.cache: dict[str, tuple[list[dict], dict[str, float], float]] = {}
@@ -62,7 +73,7 @@ class ModelFileManager:
            folder = folders[0][path_index]
            full_filename = os.path.join(folder, filename)

-            previews = self.get_model_previews(full_filename)
+            previews = get_model_previews(full_filename)
            default_preview = previews[0] if len(previews) > 0 else None
            if default_preview is None or (isinstance(default_preview, str) and not os.path.isfile(default_preview)):
                return web.Response(status=404)
@@ -76,6 +87,183 @@ class ModelFileManager:
            except:
                return web.Response(status=404)

+        @routes.get("/v2/models")
+        async def get_models(request):
+            """List model records, optionally filtered by ``path`` and/or ``type``.
+
+            When both filters are given, the response is a single model object
+            (or 404 when nothing matches) instead of a list.
+            """
+            path_filter = request.query.get("path", None)
+            type_filter = request.query.get("type", None)
+            with create_session() as session:
+                query = session.query(Model).options(joinedload(Model.tags))
+                if path_filter:
+                    query = query.filter(Model.path == path_filter)
+                if type_filter:
+                    query = query.filter(Model.type == type_filter)
+                models = query.all()
+
+                if not (path_filter and type_filter):
+                    return web.json_response([model.to_dict() for model in models], dumps=dumps)
+
+                # Both filters were given: respond with one object rather than a list.
+                if not models:
+                    return not_found("Model")
+                return web.json_response(models[0].to_dict(), dumps=dumps)
+
+        @routes.post("/v2/models")
+        async def add_model(request):
+            """Create a model record from the JSON request body.
+
+            The body must contain ``type`` and ``path``; it may contain
+            ``tags`` (tag ids) and any other column of the ``model`` table.
+            Responds 400 for missing or unknown fields and integrity errors,
+            404 when the file or a requested tag is not found, otherwise the
+            stored model as JSON.
+            """
+            with create_session() as session:
+                payload = await request.json()
+                model_type = payload.get("type", None)
+                model_path = payload.get("path", None)
+
+                if not model_type:
+                    return missing_field("type")
+                if not model_path:
+                    return missing_field("path")
+
+                # Tags are applied through the relationship, not as a column.
+                requested_tags = payload.pop("tags", [])
+                columns = Model.metadata.tables["model"].columns.keys()
+
+                # Reject the first key that is not a column of the model table
+                unknown_key = next((key for key in payload if key not in columns), None)
+                if unknown_key is not None:
+                    return bad_request(f"Invalid field: {unknown_key}")
+
+                # The referenced file must be resolvable
+                if not get_full_path(model_type, model_path):
+                    return not_found(f"File '{model_type}/{model_path}'")
+
+                model = Model()
+                for column in columns:
+                    if column in payload:
+                        setattr(model, column, payload[column])
+
+                model.tags = session.query(Tag).filter(Tag.id.in_(requested_tags)).all()
+                # Every requested id must have matched an existing tag.
+                resolved_ids = [tag.id for tag in model.tags]
+                for tag_id in requested_tags:
+                    if tag_id not in resolved_ids:
+                        return not_found(f"Tag '{tag_id}'")
+
+                try:
+                    session.add(model)
+                    session.commit()
+                except sqlalchemy.exc.IntegrityError as err:
+                    session.rollback()
+                    return bad_request(err.orig.args[0])
+
+                model_processor.run()
+
+                return web.json_response(model.to_dict(), dumps=dumps)
+            
+        @routes.delete("/v2/models")
+        async def delete_model(request):
+            """Delete the model record identified by the ``path`` and ``type`` query parameters.
+
+            Refuses with 400 while the model file can still be resolved, and
+            responds 404 when no matching record exists.
+            """
+            with create_session() as session:
+                query_args = request.query
+                model_path = query_args.get("path", None)
+                model_type = query_args.get("type", None)
+                if not model_path:
+                    return missing_field("path")
+                if not model_type:
+                    return missing_field("type")
+
+                # Only records whose file can no longer be resolved may be removed.
+                if get_full_path(model_type, model_path):
+                    return bad_request("Model file exists, please delete the file before deleting the model record.")
+
+                record = (
+                    session.query(Model)
+                    .filter(Model.path == model_path, Model.type == model_type)
+                    .first()
+                )
+                if not record:
+                    return not_found("Model")
+                session.delete(record)
+                session.commit()
+                return web.Response(status=204)
+
+        @routes.get("/v2/tags")
+        async def get_tags(request):
+            """Return every tag as a JSON list of ``{"id", "name"}`` objects."""
+            with create_session() as session:
+                payload = [
+                    {"id": row.id, "name": row.name}
+                    for row in session.query(Tag).all()
+                ]
+                return web.json_response(payload)
+
+        @routes.post("/v2/tags")
+        async def create_tag(request):
+            """Create a tag from the JSON request body.
+
+            The body must contain a non-empty ``name``. Responds 400 when the
+            name is missing or the insert violates a database constraint,
+            otherwise the new tag as ``{"id", "name"}``.
+            """
+            with create_session() as session:
+                data = await request.json()
+                name = data.get("name", None)
+                if not name:
+                    return missing_field("name")
+                tag = Tag(name=name)
+                try:
+                    session.add(tag)
+                    session.commit()
+                except sqlalchemy.exc.IntegrityError as e:
+                    # Report constraint violations as a client error, the same
+                    # way add_model does, instead of an unhandled 500.
+                    session.rollback()
+                    return bad_request(e.orig.args[0])
+                return web.json_response({"id": tag.id, "name": tag.name})
+            
+        @routes.delete("/v2/tags")
+        async def delete_tag(request):
+            """Delete the tag identified by the ``id`` query parameter.
+
+            Responds 400 when ``id`` is missing or not an integer, 404 when no
+            such tag exists, and 204 on success.
+            """
+            with create_session() as session:
+                tag_id = request.query.get("id", None)
+                if not tag_id:
+                    return missing_field("id")
+                # Validate the id as an integer, as delete_model_tag does,
+                # rather than comparing the integer column to raw text.
+                try:
+                    tag_id = int(tag_id)
+                except ValueError:
+                    return bad_request("Invalid tag id")
+                tag = session.query(Tag).filter(Tag.id == tag_id).first()
+                if not tag:
+                    return not_found("Tag")
+                session.delete(tag)
+                session.commit()
+                return web.Response(status=204)
+
+        @routes.post("/v2/models/tags")
+        async def add_model_tag(request):
+            """Attach an existing tag to an existing model.
+
+            The JSON body must contain ``tag`` (a tag id), ``path`` and
+            ``type``. Responds 400 for missing fields or a non-integer tag id,
+            404 when the model or the tag does not exist, otherwise the
+            updated model as JSON.
+            """
+            with create_session() as session:
+                data = await request.json()
+                tag_id = data.get("tag", None)
+                model_path = data.get("path", None)
+                model_type = data.get("type", None)
+
+                if tag_id is None:
+                    return missing_field("tag")
+                if model_path is None:
+                    return missing_field("path")
+                if model_type is None:
+                    return missing_field("type")
+
+                try:
+                    tag_id = int(tag_id)
+                except (TypeError, ValueError):
+                    # TypeError covers JSON values such as lists or objects.
+                    return bad_request("Invalid tag id")
+
+                model = session.query(Model).filter(Model.path == model_path, Model.type == model_type).first()
+                if not model:
+                    return not_found("Model")
+
+                # An unknown id used to fall through and append None to the
+                # relationship; report it as a missing tag instead.
+                tag = session.query(Tag).filter(Tag.id == tag_id).first()
+                if not tag:
+                    return not_found("Tag")
+
+                # Do not link the same tag to the model twice.
+                if tag not in model.tags:
+                    model.tags.append(tag)
+                session.commit()
+                return web.json_response(model.to_dict(), dumps=dumps)
+
+        @routes.delete("/v2/models/tags")
+        async def delete_model_tag(request):
+            """Detach the tag given by the ``tag`` query parameter from a model.
+
+            The model is identified by the ``path`` and ``type`` query
+            parameters. Responds 400 for missing parameters or a non-integer
+            tag id, 404 when the model does not exist, and 204 on success.
+            """
+            with create_session() as session:
+                tag_id = request.query.get("tag", None)
+                model_path = request.query.get("path", None)
+                model_type = request.query.get("type", None)
+
+                # Report the first missing parameter, checked in this order.
+                required = (("tag", tag_id), ("path", model_path), ("type", model_type))
+                for field_name, value in required:
+                    if value is None:
+                        return missing_field(field_name)
+
+                try:
+                    tag_id = int(tag_id)
+                except ValueError:
+                    return bad_request("Invalid tag id")
+
+                record = (
+                    session.query(Model)
+                    .filter(Model.path == model_path, Model.type == model_type)
+                    .first()
+                )
+                if not record:
+                    return not_found("Model")
+                # Keep every tag except the one being detached.
+                record.tags = list(filter(lambda linked: linked.id != tag_id, record.tags))
+                session.commit()
+                return web.Response(status=204)
+        
+            
+
+        @routes.get("/v2/models/missing")
+        async def get_missing_models(request):
+            """Return ``model_processor.missing_models`` as JSON."""
+            missing = model_processor.missing_models
+            return web.json_response(missing)
+
    def get_model_file_list(self, folder_name: str):
        folder_name = map_legacy(folder_name)
        folders = folder_paths.folder_names_and_paths[folder_name]
@@ -146,39 +334,5 @@ class ModelFileManager:

        return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter()

-    def get_model_previews(self, filepath: str) -> list[str | BytesIO]:
-        dirname = os.path.dirname(filepath)
-
-        if not os.path.exists(dirname):
-            return []
-
-        basename = os.path.splitext(filepath)[0]
-        match_files = glob.glob(f"{basename}.*", recursive=False)
-        image_files = filter_files_content_types(match_files, "image")
-        safetensors_file = next(filter(lambda x: x.endswith(".safetensors"), match_files), None)
-        safetensors_metadata = {}
-
-        result: list[str | BytesIO] = []
-
-        for filename in image_files:
-            _basename = os.path.splitext(filename)[0]
-            if _basename == basename:
-                result.append(filename)
-            if _basename == f"{basename}.preview":
-                result.append(filename)
-
-        if safetensors_file:
-            safetensors_filepath = os.path.join(dirname, safetensors_file)
-            header = comfy.utils.safetensors_header(safetensors_filepath, max_size=8*1024*1024)
-            if header:
-                safetensors_metadata = json.loads(header)
-        safetensors_images = safetensors_metadata.get("__metadata__", {}).get("ssmd_cover_images", None)
-        if safetensors_images:
-            safetensors_images = json.loads(safetensors_images)
-            for image in safetensors_images:
-                result.append(BytesIO(base64.b64decode(image)))
-
-        return result
-
    def __exit__(self, exc_type, exc_value, traceback):
        self.clear_cache()
--- a/app/model_processor.py
+++ b/app/model_processor.py
@@ -0,0 +1,263 @@
+import base64
+from datetime import datetime
+import glob
+import hashlib
+from io import BytesIO
+import json
+import logging
+import os
+import threading
+import time
+import comfy.utils
+from app.database.models import Model
+from app.database.db import create_session
+from comfy.cli_args import args
+from folder_paths import (
+    filter_files_content_types,
+    get_full_path,
+    folder_names_and_paths,
+    get_filename_list,
+)
+from PIL import Image
+from urllib import request
+
+
+def get_model_previews(
+    filepath: str, check_metadata: bool = True
+) -> list[str | BytesIO]:
+    """Collect preview images for the model file at *filepath*.
+
+    Sidecar image files named ``<stem>.<ext>`` or ``<stem>.preview.<ext>``
+    (``<stem>`` being *filepath* without its extension) are returned as paths.
+    When *check_metadata* is true, images embedded in a matching
+    ``.safetensors`` header under ``__metadata__["ssmd_cover_images"]`` (a JSON
+    list of base64 strings) are appended as ``BytesIO`` objects.
+
+    Returns an empty list when the directory of *filepath* does not exist.
+    """
+    dirname = os.path.dirname(filepath)
+
+    if not os.path.exists(dirname):
+        return []
+
+    basename = os.path.splitext(filepath)[0]
+    match_files = glob.glob(f"{basename}.*", recursive=False)
+    image_files = filter_files_content_types(match_files, "image")
+
+    # A sidecar preview is named either "<stem>.<ext>" or "<stem>.preview.<ext>".
+    preview_stems = (basename, f"{basename}.preview")
+    result: list[str | BytesIO] = [
+        filename
+        for filename in image_files
+        if os.path.splitext(filename)[0] in preview_stems
+    ]
+
+    if not check_metadata:
+        return result
+
+    safetensors_file = next(
+        (name for name in match_files if name.endswith(".safetensors")), None
+    )
+    if not safetensors_file:
+        return result
+
+    # glob returns paths that already carry the directory part of `basename`,
+    # so the match is used as-is; joining it onto `dirname` again would
+    # duplicate the directory whenever `filepath` is relative.
+    header = comfy.utils.safetensors_header(
+        safetensors_file, max_size=8 * 1024 * 1024
+    )
+    if not header:
+        return result
+
+    safetensors_metadata = json.loads(header)
+    safetensors_images = safetensors_metadata.get("__metadata__", {}).get(
+        "ssmd_cover_images", None
+    )
+    if safetensors_images:
+        for image in json.loads(safetensors_images):
+            result.append(BytesIO(base64.b64decode(image)))
+
+    return result
+
+
+class ModelProcessor:
+    def __init__(self):
+        """Start idle: no worker thread, no pending pass, nothing reported missing."""
+        # Models that could not be located on disk; read by the
+        # /v2/models/missing route.
+        self.missing_models = []
+        # Raised by run() to request a pass; lowered by the worker when a pass starts.
+        self._run = False
+        # Guards creation and clearing of the worker thread.
+        self._lock = threading.Lock()
+        # The background worker, or None while no worker is active.
+        self._thread = None
+
+    def run(self):
+        """Request a processing pass, starting the background worker if needed.
+
+        Does nothing when ``--disable-model-processing`` is set.
+        """
+        if args.disable_model_processing:
+            return
+
+        with self._lock:
+            # Always raise the flag: when a worker is already running, this is
+            # what makes its ``while self._run`` loop perform another pass.
+            # Raising it only when no worker existed left that loop unable to
+            # ever repeat.
+            self._run = True
+            if self._thread is None:
+                self._thread = threading.Thread(target=self._process_models)
+                self._thread.daemon = True
+                self._thread.start()
+
+    def populate_models(self, session):
+        """Reconcile the ``Model`` table with the model files currently listed.
+
+        Every file listed for a model folder (``custom_nodes`` and ``configs``
+        are skipped) that has no row yet gets one, dated from the file's ctime.
+        Rows that match no listed file and whose path can no longer be
+        resolved are logged and recorded in ``self.missing_models``. All
+        additions are committed together at the end.
+        """
+        # Index the known rows by (type, path) so each listed file is matched
+        # with one lookup instead of a scan over every row.
+        unmatched = {}
+        for row in session.query(Model).all():
+            unmatched.setdefault((row.type, row.path), row)
+
+        for folder_name in folder_names_and_paths.keys():
+            if folder_name == "custom_nodes" or folder_name == "configs":
+                continue
+            seen = set()
+
+            for file in get_filename_list(folder_name):
+                if file in seen:
+                    logging.warning(f"Skipping duplicate named model: {file}")
+                    continue
+                seen.add(file)
+
+                # Already tracked: take it out of the unmatched set and move on.
+                if unmatched.pop((folder_name, file), None) is not None:
+                    continue
+
+                file_path = get_full_path(folder_name, file)
+                if not file_path:
+                    # The listed name did not resolve to a path (get_full_path
+                    # can return a falsy value); there is no ctime to read.
+                    continue
+
+                session.add(
+                    Model(
+                        path=file,
+                        type=folder_name,
+                        date_added=datetime.fromtimestamp(os.path.getctime(file_path)),
+                    )
+                )
+
+        # Whatever is left matched no listed file; report the rows whose path
+        # cannot be resolved at all.
+        for model in unmatched.values():
+            if not get_full_path(model.type, model.path):
+                logging.warning(f"Model {model.path} not found")
+                entry = {"type": model.type, "path": model.path}
+                # This runs on every worker start; do not report a model twice.
+                if entry not in self.missing_models:
+                    self.missing_models.append(entry)
+
+        session.commit()
+
+    def _get_models(self, session):
+        """Return every ``Model`` row whose ``hash`` is still NULL."""
+        unhashed = session.query(Model).filter(Model.hash.is_(None))
+        return unhashed.all()
+
+    def _process_file(self, model_path):
+        """Hash a model file and, for ``.safetensors``, parse its JSON header.
+
+        Returns ``(sha256_hexdigest, header)``. The digest always covers the
+        whole file. ``header`` is the decoded JSON object from the safetensors
+        header, or ``{}`` when the file is not ``.safetensors`` or the header
+        is absent, implausible or not a JSON object.
+        """
+        is_safetensors = model_path.endswith(".safetensors")
+        metadata = {}
+        h = hashlib.sha256()
+
+        with open(model_path, "rb", buffering=0) as f:
+            if is_safetensors:
+                # The first 8 bytes hold the little-endian header length.
+                header_size_bytes = f.read(8)
+                h.update(header_size_bytes)
+                header_len = int.from_bytes(header_size_bytes, "little")
+
+                # A corrupt or mislabelled file can claim an absurd length;
+                # only read a header that actually fits in the file, otherwise
+                # just hash the remaining bytes below.
+                remaining = os.fstat(f.fileno()).st_size - len(header_size_bytes)
+                if 0 < header_len <= remaining:
+                    header_bytes = f.read(header_len)
+                    h.update(header_bytes)
+                    try:
+                        parsed = json.loads(header_bytes)
+                    except ValueError:
+                        # Covers both JSONDecodeError and UnicodeDecodeError.
+                        parsed = None
+                    if isinstance(parsed, dict):
+                        metadata = parsed
+
+            # Hash the rest of the file through one reusable 128 KiB buffer.
+            b = bytearray(128 * 1024)
+            mv = memoryview(b)
+            while n := f.readinto(mv):
+                h.update(mv[:n])
+
+        return h.hexdigest(), metadata
+
+    def _populate_info(self, model, metadata):
+        """Copy the modelspec title, description and architecture onto *model*.
+
+        A key absent from *metadata* sets the corresponding attribute to ``None``.
+        """
+        field_keys = (
+            ("title", "modelspec.title"),
+            ("description", "modelspec.description"),
+            ("architecture", "modelspec.architecture"),
+        )
+        for attribute, key in field_keys:
+            setattr(model, attribute, metadata.get(key, None))
+
+    def _extract_image(self, model_path, metadata):
+        """Write a preview image next to the model when none exists yet.
+
+        The image comes from the first entry of the ``ssmd_cover_images``
+        metadata value (a JSON list of base64 strings) or, failing that, from
+        the ``modelspec.thumbnail`` URL. It is scaled down to fit within
+        512x512 and saved as ``<model path without extension>.webp``. Problems
+        obtaining the image are logged as warnings rather than raised.
+        """
+        # check if image already exists
+        if len(get_model_previews(model_path, check_metadata=False)) > 0:
+            return
+
+        image_path = os.path.splitext(model_path)[0] + ".webp"
+        if os.path.exists(image_path):
+            return
+
+        image = None
+        cover_images = metadata.get("ssmd_cover_images", None)
+        if cover_images:
+            try:
+                cover_images = json.loads(cover_images)
+                if len(cover_images) > 0:
+                    image_data = cover_images[0]
+                    image = Image.open(BytesIO(base64.b64decode(image_data)))
+            except Exception as e:
+                logging.warning(
+                    f"Error extracting cover image for model {model_path}: {e}"
+                )
+
+        if not image:
+            thumbnail = metadata.get("modelspec.thumbnail", None)
+            if thumbnail:
+                try:
+                    # The URL is read from the model file itself, so it is
+                    # untrusted: accept only inline data or web URLs (never
+                    # e.g. file://) and bound how long a fetch may block the
+                    # worker.
+                    if not thumbnail.lower().startswith(("data:", "http://", "https://")):
+                        raise ValueError("unsupported thumbnail URL scheme")
+                    with request.urlopen(thumbnail, timeout=30) as response:
+                        # Buffer the payload so the response can be closed
+                        # before the image is decoded.
+                        image = Image.open(BytesIO(response.read()))
+                except Exception as e:
+                    logging.warning(
+                        f"Error extracting thumbnail for model {model_path}: {e}"
+                    )
+
+        if image:
+            try:
+                image.thumbnail((512, 512))
+                image.save(image_path)
+            finally:
+                # Release the image even when scaling or saving fails.
+                image.close()
+
+    def _process_models(self):
+        """Worker-thread body: sync the model table, then hash unhashed models.
+
+        Each pass loads the models without a hash, hashes each file, copies
+        metadata from the file header onto the row, tries to extract a preview
+        image, and commits per model. Another pass runs only if ``self._run``
+        was raised again in the meantime. The thread slot is cleared on exit
+        so that ``run()`` can start a new worker.
+        """
+        try:
+            with create_session() as session:
+                # (type, path) pairs already attempted, so a model that keeps
+                # failing is not retried forever. The type is part of the key
+                # because the same relative path can exist under two types.
+                checked = set()
+                self.populate_models(session)
+
+                while self._run:
+                    self._run = False
+
+                    models = self._get_models(session)
+
+                    if len(models) == 0:
+                        break
+
+                    for model in models:
+                        # Read these once up front; they are also used for
+                        # logging after a failure.
+                        model_type, model_rel_path = model.type, model.path
+                        key = (model_type, model_rel_path)
+                        if key in checked:
+                            continue
+
+                        checked.add(key)
+
+                        try:
+                            time.sleep(0)
+                            now = time.time()
+                            model_path = get_full_path(model_type, model_rel_path)
+
+                            if not model_path:
+                                logging.warning(f"Model {model_rel_path} not found")
+                                # Same entry shape as populate_models() records.
+                                entry = {"type": model_type, "path": model_rel_path}
+                                if entry not in self.missing_models:
+                                    self.missing_models.append(entry)
+                                continue
+
+                            logging.debug(f"Processing model {model_path}")
+                            file_hash, header = self._process_file(model_path)
+                            logging.debug(
+                                f"Processed model {model_path} in {time.time() - now} seconds"
+                            )
+                            model.hash = file_hash
+
+                            if header:
+                                metadata = header.get("__metadata__", None)
+
+                                if metadata:
+                                    self._populate_info(model, metadata)
+                                    try:
+                                        self._extract_image(model_path, metadata)
+                                    except Exception as e:
+                                        # A preview that cannot be written must
+                                        # not cost the hash just computed.
+                                        logging.warning(
+                                            f"Error extracting preview for model {model_path}: {e}"
+                                        )
+
+                            session.commit()
+                        except Exception as e:
+                            logging.error(f"Error processing model {model_rel_path}: {e}")
+                            # Discard the failed transaction so the session
+                            # stays usable for the remaining models.
+                            session.rollback()
+        finally:
+            # Always free the slot, even if setup raised; otherwise run()
+            # could never start another worker.
+            with self._lock:
+                self._thread = None
+
+
+# Module-level ModelProcessor instance; the /v2/models/missing route reads its
+# missing_models list.
+model_processor = ModelProcessor()
--- a/app/user_manager.py
+++ b/app/user_manager.py
@@ -197,112 +197,6 @@ class UserManager():

            return web.json_response(results)

-        @routes.get("/v2/userdata")
-        async def list_userdata_v2(request):
-            """
-            List files and directories in a user's data directory.
-
-            This endpoint provides a structured listing of contents within a specified
-            subdirectory of the user's data storage.
-
-            Query Parameters:
-            - path (optional): The relative path within the user's data directory
-                               to list. Defaults to the root ('').
-
-            Returns:
-            - 400: If the requested path is invalid, outside the user's data directory, or is not a directory.
-            - 404: If the requested path does not exist.
-            - 403: If the user is invalid.
-            - 500: If there is an error reading the directory contents.
-            - 200: JSON response containing a list of file and directory objects.
-                   Each object includes:
-                   - name: The name of the file or directory.
-                   - type: 'file' or 'directory'.
-                   - path: The relative path from the user's data root.
-                   - size (for files): The size in bytes.
-                   - modified (for files): The last modified timestamp (Unix epoch).
-            """
-            requested_rel_path = request.rel_url.query.get('path', '')
-
-            # URL-decode the path parameter
-            try:
-                requested_rel_path = parse.unquote(requested_rel_path)
-            except Exception as e:
-                logging.warning(f"Failed to decode path parameter: {requested_rel_path}, Error: {e}")
-                return web.Response(status=400, text="Invalid characters in path parameter")
-
-
-            # Check user validity and get the absolute path for the requested directory
-            try:
-                 base_user_path = self.get_request_user_filepath(request, None, create_dir=False)
-
-                 if requested_rel_path:
-                     target_abs_path = self.get_request_user_filepath(request, requested_rel_path, create_dir=False)
-                 else:
-                     target_abs_path = base_user_path
-
-            except KeyError as e:
-                 # Invalid user detected by get_request_user_id inside get_request_user_filepath
-                 logging.warning(f"Access denied for user: {e}")
-                 return web.Response(status=403, text="Invalid user specified in request")
-
-
-            if not target_abs_path:
-                 # Path traversal or other issue detected by get_request_user_filepath
-                 return web.Response(status=400, text="Invalid path requested")
-
-            # Handle cases where the user directory or target path doesn't exist
-            if not os.path.exists(target_abs_path):
-                # Check if it's the base user directory that's missing (new user case)
-                if target_abs_path == base_user_path:
-                    # It's okay if the base user directory doesn't exist yet, return empty list
-                     return web.json_response([])
-                else:
-                    # A specific subdirectory was requested but doesn't exist
-                     return web.Response(status=404, text="Requested path not found")
-
-            if not os.path.isdir(target_abs_path):
-                 return web.Response(status=400, text="Requested path is not a directory")
-
-            results = []
-            try:
-                for root, dirs, files in os.walk(target_abs_path, topdown=True):
-                    # Process directories
-                    for dir_name in dirs:
-                        dir_path = os.path.join(root, dir_name)
-                        rel_path = os.path.relpath(dir_path, base_user_path).replace(os.sep, '/')
-                        results.append({
-                            "name": dir_name,
-                            "path": rel_path,
-                            "type": "directory"
-                        })
-
-                    # Process files
-                    for file_name in files:
-                        file_path = os.path.join(root, file_name)
-                        rel_path = os.path.relpath(file_path, base_user_path).replace(os.sep, '/')
-                        entry_info = {
-                            "name": file_name,
-                            "path": rel_path,
-                            "type": "file"
-                        }
-                        try:
-                            stats = os.stat(file_path) # Use os.stat for potentially better performance with os.walk
-                            entry_info["size"] = stats.st_size
-                            entry_info["modified"] = stats.st_mtime
-                        except OSError as stat_error:
-                            logging.warning(f"Could not stat file {file_path}: {stat_error}")
-                            pass # Include file with available info
-                        results.append(entry_info)
-            except OSError as e:
-                logging.error(f"Error listing directory {target_abs_path}: {e}")
-                return web.Response(status=500, text="Error reading directory contents")
-
-            # Sort results alphabetically, directories first then files
-            results.sort(key=lambda x: (x['type'] != 'directory', x['name'].lower()))
-
-            return web.json_response(results)
-
        def get_user_data_path(request, check_exists = False, param = "file"):
            file = request.match_info.get(param, None)
            if not file:
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -1,6 +1,7 @@
 import argparse
 import enum
 import os
+from typing import Optional
 import comfy.options


@@ -42,11 +43,10 @@ parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certific
 parser.add_argument("--enable-cors-header", type=str, default=None, metavar="ORIGIN", nargs="?", const="*", help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
 parser.add_argument("--max-upload-size", type=float, default=100, help="Set the maximum upload size in MB.")

-parser.add_argument("--base-directory", type=str, default=None, help="Set the ComfyUI base directory for models, custom_nodes, input, output, temp, and user directories.")
 parser.add_argument("--extra-model-paths-config", type=str, default=None, metavar="PATH", nargs='+', action='append', help="Load one or more extra_model_paths.yaml files.")
-parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory. Overrides --base-directory.")
-parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory). Overrides --base-directory.")
-parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
+parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
+parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory).")
+parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
@@ -66,7 +66,6 @@ fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the diff
 fpunet_group.add_argument("--fp16-unet", action="store_true", help="Run the diffusion model in fp16")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")
-fpunet_group.add_argument("--fp8_e8m0fnu-unet", action="store_true", help="Store unet weights in fp8_e8m0fnu.")

 fpvae_group = parser.add_mutually_exclusive_group()
 fpvae_group.add_argument("--fp16-vae", action="store_true", help="Run the VAE in fp16, might cause black images.")
@@ -80,7 +79,6 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
-fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

@@ -102,14 +100,12 @@ parser.add_argument("--preview-size", type=int, default=512, help="Sets the maxi
 cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
-cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
 attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
 attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
-attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")

 parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")

@@ -128,21 +124,12 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e

 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")

-parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")

 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
-
-class PerformanceFeature(enum.Enum):
-    Fp16Accumulation = "fp16_accumulation"
-    Fp8MatrixMultiplication = "fp8_matrix_mult"
-    CublasOps = "cublas_ops"
-
-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
-
-parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
+parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@@ -150,7 +137,6 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win

 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")

 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

@@ -174,14 +160,13 @@ parser.add_argument(
    """,
 )

-def is_valid_directory(path: str) -> str:
-    """Validate if the given path is a directory, and check permissions."""
-    if not os.path.exists(path):
-        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
+
    if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
-    if not os.access(path, os.R_OK):
-        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
    return path

 parser.add_argument(
@@ -191,16 +176,13 @@ parser.add_argument(
    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
 )

-parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. Overrides --base-directory.")
+parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path.")

-parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")
-
-parser.add_argument(
-    "--comfy-api-base",
-    type=str,
-    default="https://api.comfy.org",
-    help="Set the base URL for the ComfyUI API.  (default: https://api.comfy.org)",
+database_default_path = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "user", "comfyui.db") 
 )
+parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
+parser.add_argument("--disable-model-processing", action="store_true", help="Disable model file processing, e.g. computing hashes and extracting metadata.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
@@ -212,17 +194,3 @@ if args.windows_standalone_build:

 if args.disable_auto_launch:
    args.auto_launch = False
-
-if args.force_fp16:
-    args.fp16_unet = True
-
-
-# '--fast' is not provided, use an empty set
-if args.fast is None:
-    args.fast = set()
-# '--fast' is provided with an empty list, enable all optimizations
-elif args.fast == []:
-    args.fast = set(PerformanceFeature)
-# '--fast' is provided with a list of performance features, use that list
-else:
-    args.fast = set(args.fast)
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -97,19 +97,14 @@ class CLIPTextModel_(torch.nn.Module):
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
-        if embeds is not None:
-            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
-        else:
-            x = self.embeddings(input_tokens, dtype=dtype)
-
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
-
-        causal_mask = torch.full((x.shape[1], x.shape[1]), -torch.finfo(x.dtype).max, dtype=x.dtype, device=x.device).triu_(1)
+            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

+        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
        if mask is not None:
            mask += causal_mask
        else:
@@ -120,10 +115,7 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

-        if num_tokens is not None:
-            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
-        else:
-            pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output

 class CLIPTextModel(torch.nn.Module):
@@ -211,15 +203,6 @@ class CLIPVision(torch.nn.Module):
            pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output

-class LlavaProjector(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, dtype, device, operations):
-        super().__init__()
-        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
-        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
-
 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
@@ -229,16 +212,7 @@ class CLIPVisionModelProjection(torch.nn.Module):
        else:
            self.visual_projection = lambda a: a

-        if "llava3" == config_dict.get("projector_type", None):
-            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
-        else:
-            self.multi_modal_projector = None
-
    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
        out = self.visual_projection(x[2])
-        projected = None
-        if self.multi_modal_projector is not None:
-            projected = self.multi_modal_projector(x[1])
-
-        return (x[0], x[1], out, projected)
+        return (x[0], x[1], out)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -9,7 +9,6 @@ import comfy.model_patcher
 import comfy.model_management
 import comfy.utils
 import comfy.clip_model
-import comfy.image_encoders.dino2

 class Output:
    def __getitem__(self, key):
@@ -18,7 +17,6 @@ class Output:
        setattr(self, key, item)

 def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
-    image = image[:, :, :, :3] if image.shape[3] > 3 else image
    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
    std = torch.tensor(std, device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
@@ -36,12 +34,6 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3,1,1])) / std.view([3,1,1])

-IMAGE_ENCODERS = {
-    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
-}
-
 class ClipVisionModel():
    def __init__(self, json_config):
        with open(json_config) as f:
@@ -50,11 +42,10 @@ class ClipVisionModel():
        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
@@ -74,7 +65,6 @@ class ClipVisionModel():
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
-        outputs["mm_projected"] = out[3]
        return outputs

 def convert_to_transformers(sd, prefix):
@@ -111,21 +101,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
-        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            if embed_shape == 729:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-            elif embed_shape == 1024:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
-        elif embed_shape == 577:
-            if "multi_modal_projector.linear_1.bias" in sd:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
-            else:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
        else:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
-    elif "embeddings.patch_embeddings.projection.weight" in sd:
-        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
    else:
        return None

--- a/comfy/clip_vision_config_vitl_336_llava.json
+++ b/comfy/clip_vision_config_vitl_336_llava.json
@@ -1,19 +0,0 @@
-{
-  "attention_dropout": 0.0,
-  "dropout": 0.0,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 1024,
-  "image_size": 336,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 4096,
-  "layer_norm_eps": 1e-5,
-  "model_type": "clip_vision_model",
-  "num_attention_heads": 16,
-  "num_channels": 3,
-  "num_hidden_layers": 24,
-  "patch_size": 14,
-  "projection_dim": 768,
-  "projector_type": "llava3",
-  "torch_dtype": "float32"
-}
--- a/comfy/clip_vision_siglip_512.json
+++ b/comfy/clip_vision_siglip_512.json
@@ -1,13 +0,0 @@
-{
-  "num_channels": 3,
-  "hidden_act": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
-  "image_size": 512,
-  "intermediate_size": 4304,
-  "model_type": "siglip_vision_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 27,
-  "patch_size": 16,
-  "image_mean": [0.5, 0.5, 0.5],
-  "image_std": [0.5, 0.5, 0.5]
-}
--- a/comfy/comfy_types/init.py
+++ b/comfy/comfy_types/init.py
@@ -1,6 +1,6 @@
 import torch
 from typing import Callable, Protocol, TypedDict, Optional, List
-from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator
+from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin


 class UnetApplyFunction(Protocol):
@@ -42,5 +42,4 @@ __all__ = [
    InputTypeDict.__name__,
    ComfyNodeABC.__name__,
    CheckLazyMixin.__name__,
-    FileLocator.__name__,
 ]
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@@ -1,8 +1,7 @@
 """Comfy-specific type hinting"""

 from __future__ import annotations
-from typing import Literal, TypedDict, Optional
-from typing_extensions import NotRequired
+from typing import Literal, TypedDict
 from abc import ABC, abstractmethod
 from enum import Enum

@@ -27,7 +26,6 @@ class IO(StrEnum):
    BOOLEAN = "BOOLEAN"
    INT = "INT"
    FLOAT = "FLOAT"
-    COMBO = "COMBO"
    CONDITIONING = "CONDITIONING"
    SAMPLER = "SAMPLER"
    SIGMAS = "SIGMAS"
@@ -48,7 +46,6 @@ class IO(StrEnum):
    FACE_ANALYSIS = "FACE_ANALYSIS"
    BBOX = "BBOX"
    SEGS = "SEGS"
-    VIDEO = "VIDEO"

    ANY = "*"
    """Always matches any type, but at a price.
@@ -70,148 +67,90 @@ class IO(StrEnum):
        return not (b.issubset(a) or a.issubset(b))


-class RemoteInputOptions(TypedDict):
-    route: str
-    """The route to the remote source."""
-    refresh_button: bool
-    """Specifies whether to show a refresh button in the UI below the widget."""
-    control_after_refresh: Literal["first", "last"]
-    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
-    timeout: int
-    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
-    max_retries: int
-    """The maximum number of retries before aborting the request."""
-    refresh: int
-    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
-
-
-class MultiSelectOptions(TypedDict):
-    placeholder: NotRequired[str]
-    """The placeholder text to display in the multi-select widget when no items are selected."""
-    chip: NotRequired[bool]
-    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
-
-
 class InputTypeOptions(TypedDict):
    """Provides type hinting for the return type of the INPUT_TYPES node function.

    Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_datatypes
    """

-    default: NotRequired[bool | str | float | int | list | tuple]
+    default: bool | str | float | int | list | tuple
    """The default value of the widget"""
-    defaultInput: NotRequired[bool]
-    """@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
-    - defaultInput on required inputs should be dropped.
-    - defaultInput on optional inputs should be replaced with forceInput.
-    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
-    """
-    forceInput: NotRequired[bool]
-    """Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
-    lazy: NotRequired[bool]
+    defaultInput: bool
+    """Defaults to an input slot rather than a widget"""
+    forceInput: bool
+    """`defaultInput` and also don't allow converting to a widget"""
+    lazy: bool
    """Declares that this input uses lazy evaluation"""
-    rawLink: NotRequired[bool]
+    rawLink: bool
    """When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
-    tooltip: NotRequired[str]
+    tooltip: str
    """Tooltip for the input (or widget), shown on pointer hover"""
-    socketless: NotRequired[bool]
-    """All inputs (including widgets) have an input socket to connect links. When ``true``, if there is a widget for this input, no socket will be created.
-    Available from frontend v1.17.5
-    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3548
-    """
-    widgetType: NotRequired[str]
-    """Specifies a type to be used for widget initialization if different from the input type.
-    Available from frontend v1.18.0
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/3550"""
    # class InputTypeNumber(InputTypeOptions):
    # default: float | int
-    min: NotRequired[float]
+    min: float
    """The minimum value of a number (``FLOAT`` | ``INT``)"""
-    max: NotRequired[float]
+    max: float
    """The maximum value of a number (``FLOAT`` | ``INT``)"""
-    step: NotRequired[float]
+    step: float
    """The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
-    round: NotRequired[float]
+    round: float
    """Floats are rounded by this value (``FLOAT``)"""
    # class InputTypeBoolean(InputTypeOptions):
    # default: bool
-    label_on: NotRequired[str]
+    label_on: str
    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
-    label_off: NotRequired[str]
+    label_off: str
    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
    # class InputTypeString(InputTypeOptions):
    # default: str
-    multiline: NotRequired[bool]
+    multiline: bool
    """Use a multiline text box (``STRING``)"""
-    placeholder: NotRequired[str]
+    placeholder: str
    """Placeholder text to display in the UI when empty (``STRING``)"""
    # Deprecated:
    # defaultVal: str
-    dynamicPrompts: NotRequired[bool]
+    dynamicPrompts: bool
    """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
-    # class InputTypeCombo(InputTypeOptions):
-    image_upload: NotRequired[bool]
-    """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
-    image_folder: NotRequired[Literal["input", "output", "temp"]]
-    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
-    """
-    remote: NotRequired[RemoteInputOptions]
-    """Specifies the configuration for a remote input.
-    Available after ComfyUI frontend v1.9.7
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
-    control_after_generate: NotRequired[bool]
-    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
-    options: NotRequired[list[str | int | float]]
-    """COMBO type only. Specifies the selectable options for the combo widget.
-    Prefer:
-    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
-    Over:
-    [["Option 1", "Option 2", "Option 3"]]
-    """
-    multi_select: NotRequired[MultiSelectOptions]
-    """COMBO type only. Specifies the configuration for a multi-select widget.
-    Available after ComfyUI frontend v1.13.4
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""


 class HiddenInputTypeDict(TypedDict):
    """Provides type hinting for the hidden entry of node INPUT_TYPES."""

-    node_id: NotRequired[Literal["UNIQUE_ID"]]
+    node_id: Literal["UNIQUE_ID"]
    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    unique_id: NotRequired[Literal["UNIQUE_ID"]]
+    unique_id: Literal["UNIQUE_ID"]
    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    prompt: NotRequired[Literal["PROMPT"]]
+    prompt: Literal["PROMPT"]
    """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
-    extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]]
+    extra_pnginfo: Literal["EXTRA_PNGINFO"]
    """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
-    dynprompt: NotRequired[Literal["DYNPROMPT"]]
+    dynprompt: Literal["DYNPROMPT"]
    """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""


 class InputTypeDict(TypedDict):
    """Provides type hinting for node INPUT_TYPES.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs
    """

-    required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    required: dict[str, tuple[IO, InputTypeOptions]]
    """Describes all inputs that must be connected for the node to execute."""
-    optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    optional: dict[str, tuple[IO, InputTypeOptions]]
    """Describes inputs which do not need to be connected."""
-    hidden: NotRequired[HiddenInputTypeDict]
+    hidden: HiddenInputTypeDict
    """Offers advanced functionality and server-client communication.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs
    """


 class ComfyNodeABC(ABC):
    """Abstract base class for Comfy nodes.  Includes the names and expected types of attributes.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview
    """

    DESCRIPTION: str
@@ -228,14 +167,12 @@ class ComfyNodeABC(ABC):
    CATEGORY: str
    """The category of the node, as per the "Add Node" menu.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#category
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#category
    """
    EXPERIMENTAL: bool
    """Flags a node as experimental, informing users that it may change or not work as expected."""
    DEPRECATED: bool
    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
-    API_NODE: Optional[bool]
-    """Flags a node as an API node."""

    @classmethod
    @abstractmethod
@@ -244,9 +181,9 @@ class ComfyNodeABC(ABC):

        * Must include the ``required`` key, which describes all inputs that must be connected for the node to execute.
        * The ``optional`` key can be added to describe inputs which do not need to be connected.
-        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs

-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#input-types
+        Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#input-types
        """
        return {"required": {}}

@@ -261,7 +198,7 @@ class ComfyNodeABC(ABC):

    By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#output-node
    """
    INPUT_IS_LIST: bool
    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
@@ -272,9 +209,9 @@ class ComfyNodeABC(ABC):

    A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
    """
-    OUTPUT_IS_LIST: tuple[bool, ...]
+    OUTPUT_IS_LIST: tuple[bool, ...]
    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.

    Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
@@ -290,29 +227,29 @@ class ComfyNodeABC(ABC):
    the node should provide a class attribute `OUTPUT_IS_LIST`, which is a ``tuple[bool]``, of the same length as `RETURN_TYPES`,
    specifying which outputs which should be so treated.

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
    """

-    RETURN_TYPES: tuple[IO, ...]
+    RETURN_TYPES: tuple[IO, ...]
    """A tuple representing the outputs of this node.

    Usage::

        RETURN_TYPES = (IO.INT, "INT", "CUSTOM_TYPE")

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-types
    """
-    RETURN_NAMES: tuple[str, ...]
+    RETURN_NAMES: tuple[str, ...]
    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-names
    """
-    OUTPUT_TOOLTIPS: tuple[str, ...]
+    OUTPUT_TOOLTIPS: tuple[str, ...]
    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
    FUNCTION: str
    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`

-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#function
+    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#function
    """


@@ -330,19 +267,8 @@ class CheckLazyMixin:
        Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name).
        Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params).

-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status
+        Comfy Docs: https://docs.comfy.org/essentials/custom_node_lazy_evaluation#defining-check-lazy-status
        """

        need = [name for name in kwargs if kwargs[name] is None]
        return need
-
-
-class FileLocator(TypedDict):
-    """Provides type hinting for the file location"""
-
-    filename: str
-    """The filename of the file."""
-    subfolder: str
-    """The subfolder of the file."""
-    type: Literal["input", "output", "temp"]
-    """The root folder of the file."""
--- a/comfy/conds.py
+++ b/comfy/conds.py
@@ -3,6 +3,9 @@ import math
 import comfy.utils


+def lcm(a, b):  # TODO: replace with math.lcm (Python 3.9+); like math.lcm, returns 0 when either argument is 0
+    return abs(a * b) // math.gcd(a, b) if a and b else 0
+
 class CONDRegular:
    def __init__(self, cond):
        self.cond = cond
@@ -43,7 +46,7 @@ class CONDCrossAttn(CONDRegular):
            if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
                return False

-            mult_min = math.lcm(s1[1], s2[1])
+            mult_min = lcm(s1[1], s2[1])
            diff = mult_min // min(s1[1], s2[1])
            if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
                return False
@@ -54,7 +57,7 @@ class CONDCrossAttn(CONDRegular):
        crossattn_max_len = self.cond.shape[1]
        for x in others:
            c = x.cond
-            crossattn_max_len = math.lcm(crossattn_max_len, c.shape[1])
+            crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
            conds.append(c)

        out = []
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -418,7 +418,10 @@ def controlnet_config(sd, model_options={}):
        weight_dtype = comfy.utils.weight_dtype(sd)

        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)

    load_device = comfy.model_management.get_torch_device()
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
@@ -686,7 +689,10 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
        if supported_inference_dtypes is None:
            supported_inference_dtypes = [comfy.model_management.unet_dtype()]

-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+        if weight_dtype is not None:
+            supported_inference_dtypes.append(weight_dtype)
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)

    load_device = comfy.model_management.get_torch_device()

@@ -736,7 +742,6 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
    return control

 def load_controlnet(ckpt_path, model=None, model_options={}):
-    model_options = model_options.copy()
    if "global_average_pooling" not in model_options:
        filename = os.path.splitext(ckpt_path)[0]
        if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
--- a/comfy/diffusers_convert.py
+++ b/comfy/diffusers_convert.py
@@ -4,6 +4,105 @@ import logging

 # conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py

+# =================#
+# UNet Conversion #
+# =================#
+
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i < 3:
+            # no attention layers in down_blocks.3
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2 * j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    # Rename HF Diffusers UNet keys to stable-diffusion names; returns a new dict.
+    # Brittle: the passes below (exact names, then resnet parts, then layer
+    # prefixes) use substring replacement and must run in exactly this order.
+    mapping = {k: k for k in unet_state_dict.keys()}
+    for sd_name, hf_name in unet_conversion_map:
+        mapping[hf_name] = sd_name  # NOTE: added even if hf_name is absent from the input; the final lookup then raises KeyError
+    for k, v in mapping.items():
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
+    return new_state_dict
+
+
 # ================#
 # VAE Conversion #
 # ================#
@@ -114,7 +213,6 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
 code2idx = {"q": 0, "k": 1, "v": 2}

-
 # This function exists because at the time of writing torch.cat can't do fp8 with cuda
 def cat_tensors(tensors):
    x = 0
@@ -131,7 +229,6 @@ def cat_tensors(tensors):

    return out

-
 def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
    new_state_dict = {}
    capture_qkv_weight = {}
@@ -187,3 +284,5 @@ def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):

 def convert_text_enc_state_dict(text_enc_dict):
    return text_enc_dict
+
+
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -661,7 +661,7 @@ class UniPC:

            if x_t is None:
                if use_predictor:
-                    pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                    pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
                else:
                    pred_res = 0
                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
@@ -669,7 +669,7 @@ class UniPC:
            if use_corrector:
                model_t = self.model_fn(x_t, t)
                if D1s is not None:
-                    corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                    corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
                else:
                    corr_res = 0
                D1_t = (model_t - model_prev_0)
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@@ -1,141 +0,0 @@
-import torch
-from comfy.text_encoders.bert import BertAttention
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention_for_device
-
-
-class Dino2AttentionOutput(torch.nn.Module):
-    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.dense(x)
-
-
-class Dino2AttentionBlock(torch.nn.Module):
-    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
-        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
-
-    def forward(self, x, mask, optimized_attention):
-        return self.output(self.attention(x, mask, optimized_attention))
-
-
-class LayerScale(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        self.lambda1 = torch.nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
-
-    def forward(self, x):
-        return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
-
-
-class SwiGLUFFN(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        in_features = out_features = dim
-        hidden_features = int(dim * 4)
-        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
-
-        self.weights_in = operations.Linear(in_features, 2 * hidden_features, bias=True, device=device, dtype=dtype)
-        self.weights_out = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.weights_in(x)
-        x1, x2 = x.chunk(2, dim=-1)
-        x = torch.nn.functional.silu(x1) * x2
-        return self.weights_out(x)
-
-
-class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
-        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
-        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
-        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
-        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, x, optimized_attention):
-        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
-        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
-        return x
-
-
-class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
-        super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
-
-    def forward(self, x, intermediate_output=None):
-        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
-
-        if intermediate_output is not None:
-            if intermediate_output < 0:
-                intermediate_output = len(self.layer) + intermediate_output
-
-        intermediate = None
-        for i, l in enumerate(self.layer):
-            x = l(x, optimized_attention)
-            if i == intermediate_output:
-                intermediate = x.clone()
-        return x, intermediate
-
-
-class Dino2PatchEmbeddings(torch.nn.Module):
-    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.projection = operations.Conv2d(
-            in_channels=num_channels,
-            out_channels=dim,
-            kernel_size=patch_size,
-            stride=patch_size,
-            bias=True,
-            dtype=dtype,
-            device=device
-        )
-
-    def forward(self, pixel_values):
-        return self.projection(pixel_values).flatten(2).transpose(1, 2)
-
-
-class Dino2Embeddings(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        patch_size = 14
-        image_size = 518
-
-        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
-        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
-        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
-        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
-
-    def forward(self, pixel_values):
-        x = self.patch_embeddings(pixel_values)
-        # TODO: mask_token?
-        x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
-        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
-        return x
-
-
-class Dinov2Model(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device, operations):
-        super().__init__()
-        num_layers = config_dict["num_hidden_layers"]
-        dim = config_dict["hidden_size"]
-        heads = config_dict["num_attention_heads"]
-        layer_norm_eps = config_dict["layer_norm_eps"]
-
-        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
-        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
-        x = self.embeddings(pixel_values)
-        x, i = self.encoder(x, intermediate_output=intermediate_output)
-        x = self.layernorm(x)
-        pooled_output = x[:, 0, :]
-        return x, i, pooled_output, None
--- a/comfy/image_encoders/dino2_giant.json
+++ b/comfy/image_encoders/dino2_giant.json
@@ -1,21 +0,0 @@
-{
-  "attention_probs_dropout_prob": 0.0,
-  "drop_path_rate": 0.0,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_size": 1536,
-  "image_size": 518,
-  "initializer_range": 0.02,
-  "layer_norm_eps": 1e-06,
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
-  "model_type": "dinov2",
-  "num_attention_heads": 24,
-  "num_channels": 3,
-  "num_hidden_layers": 40,
-  "patch_size": 14,
-  "qkv_bias": true,
-  "use_swiglu_ffn": true,
-  "image_mean": [0.485, 0.456, 0.406],
-  "image_std": [0.229, 0.224, 0.225]
-}
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -40,7 +40,7 @@ def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
 def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    """Constructs a continuous VP noise schedule."""
    t = torch.linspace(1, eps_s, n, device=device)
-    sigmas = torch.sqrt(torch.special.expm1(beta_d * t ** 2 / 2 + beta_min * t))
+    sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1)
    return append_zero(sigmas)


@@ -688,10 +688,10 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    if len(sigmas) <= 1:
        return x

-    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@@ -762,10 +762,10 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

-    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    old_denoised = None
@@ -808,10 +808,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if len(sigmas) <= 1:
        return x

-    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    denoised_1, denoised_2 = None, None
@@ -858,7 +858,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@@ -867,7 +867,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@@ -876,7 +876,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@@ -1267,7 +1267,7 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis
    return x

@torch.no_grad()
-def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, eta=1., cfg_pp=False):
+def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None, cfg_pp=False):
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
@@ -1277,7 +1277,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
    phi1_fn = lambda t: torch.expm1(t) / t
    phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t

-    old_sigma_down = None
    old_denoised = None
    uncond_denoised = None
    def post_cfg_function(args):
@@ -1290,259 +1289,50 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)

    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        if s_churn > 0:
+            gamma = min(s_churn / (len(sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.0
+            sigma_hat = sigmas[i] * (gamma + 1)
+        else:
+            gamma = 0
+            sigma_hat = sigmas[i]
+
+        if gamma > 0:
+            eps = torch.randn_like(x) * s_noise
+            x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
+        denoised = model(x, sigma_hat * s_in, **extra_args)
        if callback is not None:
-            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
-        if sigma_down == 0 or old_denoised is None:
+            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigma_hat, "denoised": denoised})
+        if sigmas[i + 1] == 0 or old_denoised is None:
            # Euler method
            if cfg_pp:
-                d = to_d(x, sigmas[i], uncond_denoised)
-                x = denoised + d * sigma_down
+                d = to_d(x, sigma_hat, uncond_denoised)
+                x = denoised + d * sigmas[i + 1]
            else:
-                d = to_d(x, sigmas[i], denoised)
-                dt = sigma_down - sigmas[i]
+                d = to_d(x, sigma_hat, denoised)
+                dt = sigmas[i + 1] - sigma_hat
                x = x + d * dt
        else:
            # Second order multistep method in https://arxiv.org/pdf/2308.02157
-            t, t_old, t_next, t_prev = t_fn(sigmas[i]), t_fn(old_sigma_down), t_fn(sigma_down), t_fn(sigmas[i - 1])
+            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigmas[i + 1]), t_fn(sigmas[i - 1])
            h = t_next - t
-            c2 = (t_prev - t_old) / h
+            c2 = (t_prev - t) / h

            phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
-            b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
-            b2 = torch.nan_to_num(phi2_val / c2, nan=0.0)
+            b1 = torch.nan_to_num(phi1_val - 1.0 / c2 * phi2_val, nan=0.0)
+            b2 = torch.nan_to_num(1.0 / c2 * phi2_val, nan=0.0)

            if cfg_pp:
                x = x + (denoised - uncond_denoised)
-                x = sigma_fn(h) * x + h * (b1 * uncond_denoised + b2 * old_denoised)
-            else:
-                x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised)

-        # Noise addition
-        if sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+            x = (sigma_fn(t_next) / sigma_fn(t)) * x + h * (b1 * denoised + b2 * old_denoised)

-        if cfg_pp:
-            old_denoised = uncond_denoised
-        else:
-            old_denoised = denoised
-        old_sigma_down = sigma_down
-    return x
-
-@torch.no_grad()
-def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=True)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
-
-@torch.no_grad()
-def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2., cfg_pp=False):
-    """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
-    extra_args = {} if extra_args is None else extra_args
-    s_in = x.new_ones([x.shape[0]])
-    old_d = None
-
-    uncond_denoised = None
-    def post_cfg_function(args):
-        nonlocal uncond_denoised
-        uncond_denoised = args["uncond_denoised"]
-        return args["denoised"]
-
-    if cfg_pp:
-        model_options = extra_args.get("model_options", {}).copy()
-        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if cfg_pp:
-            d = to_d(x, sigmas[i], uncond_denoised)
-        else:
-            d = to_d(x, sigmas[i], denoised)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        dt = sigmas[i + 1] - sigmas[i]
-        if i == 0:
-            # Euler method
-            if cfg_pp:
-                x = denoised + d * sigmas[i + 1]
-            else:
-                x = x + d * dt
-        else:
-            # Gradient estimation
-            if cfg_pp:
-                d_bar = (ge_gamma - 1) * (d - old_d)
-                x = denoised + d * sigmas[i + 1] + d_bar * dt
-            else:
-                d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
-                x = x + d_bar * dt
-        old_d = d
-    return x
-
-@torch.no_grad()
-def sample_gradient_estimation_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
-    return sample_gradient_estimation(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, ge_gamma=ge_gamma, cfg_pp=True)
-
-@torch.no_grad()
-def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
-    """
-    Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
-    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
-    """
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    def default_noise_scaler(sigma):
-        return sigma * ((sigma ** 0.3).exp() + 10.0)
-    noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
-    num_integration_points = 200.0
-    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
-
-    old_denoised = None
-    old_denoised_d = None
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        stage_used = min(max_stage, i + 1)
-        if sigmas[i + 1] == 0:
-            x = denoised
-        elif stage_used == 1:
-            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
-            x = r * x + (1 - r) * denoised
-        else:
-            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
-            x = r * x + (1 - r) * denoised
-
-            dt = sigmas[i + 1] - sigmas[i]
-            sigma_step_size = -dt / num_integration_points
-            sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
-            scaled_pos = noise_scaler(sigma_pos)
-
-            # Stage 2
-            s = torch.sum(1 / scaled_pos) * sigma_step_size
-            denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
-            x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
-
-            if stage_used >= 3:
-                # Stage 3
-                s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
-                denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
-                x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
-            old_denoised_d = denoised_d
-
-        if s_noise != 0 and sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
        old_denoised = denoised
    return x

@torch.no_grad()
-def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
-    '''
-    SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
-    Arxiv: https://arxiv.org/abs/2305.14267
-    '''
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    inject_noise = eta > 0 and s_noise > 0
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
-            h = t_next - t
-            h_eta = h * (eta + 1)
-            s = t + r * h
-            fac = 1 / (2 * r)
-            sigma_s = s.neg().exp()
-
-            coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
-            if inject_noise:
-                noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
-                noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
-                noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
-
-            # Step 1
-            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
-            if inject_noise:
-                x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
-            denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
-
-            # Step 2
-            denoised_d = (1 - fac) * denoised + fac * denoised_2
-            x = (coeff_2 + 1) * x - coeff_2 * denoised_d
-            if inject_noise:
-                x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
-    return x
+def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=False)

@torch.no_grad()
-def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
-    '''
-    SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
-    Arxiv: https://arxiv.org/abs/2305.14267
-    '''
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    inject_noise = eta > 0 and s_noise > 0
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
-            h = t_next - t
-            h_eta = h * (eta + 1)
-            s_1 = t + r_1 * h
-            s_2 = t + r_2 * h
-            sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
-
-            coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
-            if inject_noise:
-                noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
-                noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
-                noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
-                noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
-
-            # Step 1
-            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
-            if inject_noise:
-                x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
-            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
-
-            # Step 2
-            x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
-            if inject_noise:
-                x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
-            denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
-
-            # Step 3
-            x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
-            if inject_noise:
-                x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
-    return x
+def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise, noise_sampler=noise_sampler, cfg_pp=True)
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -407,66 +407,3 @@ class Cosmos1CV8x8x8(LatentFormat):
    ]

    latent_rgb_factors_bias = [-0.1223, -0.1889, -0.1976]
-
-class Wan21(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-
-    latent_rgb_factors = [
-            [-0.1299, -0.1692,  0.2932],
-            [ 0.0671,  0.0406,  0.0442],
-            [ 0.3568,  0.2548,  0.1747],
-            [ 0.0372,  0.2344,  0.1420],
-            [ 0.0313,  0.0189, -0.0328],
-            [ 0.0296, -0.0956, -0.0665],
-            [-0.3477, -0.4059, -0.2925],
-            [ 0.0166,  0.1902,  0.1975],
-            [-0.0412,  0.0267, -0.1364],
-            [-0.1293,  0.0740,  0.1636],
-            [ 0.0680,  0.3019,  0.1128],
-            [ 0.0032,  0.0581,  0.0639],
-            [-0.1251,  0.0927,  0.1699],
-            [ 0.0060, -0.0633,  0.0005],
-            [ 0.3477,  0.2275,  0.2950],
-            [ 0.1984,  0.0913,  0.1861]
-        ]
-
-    latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
-
-    def __init__(self):
-        self.scale_factor = 1.0
-        self.latents_mean = torch.tensor([
-            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
-            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
-        ]).view(1, self.latent_channels, 1, 1, 1)
-        self.latents_std = torch.tensor([
-            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
-            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
-        ]).view(1, self.latent_channels, 1, 1, 1)
-
-
-        self.taesd_decoder_name = None #TODO
-
-    def process_in(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return (latent - latents_mean) * self.scale_factor / latents_std
-
-    def process_out(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return latent * latents_std / self.scale_factor + latents_mean
-
-class Hunyuan3Dv2(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 0.9990943042622529
-
-class Hunyuan3Dv2mini(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 1.0188137142395404
-
-class ACEAudio(LatentFormat):
-    latent_channels = 8
-    latent_dimensions = 2
--- a/comfy/ldm/ace/attention.py
+++ b/comfy/ldm/ace/attention.py
@@ -1,761 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/attention.py
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Tuple, Union, Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        query_dim: int,
-        cross_attention_dim: Optional[int] = None,
-        heads: int = 8,
-        kv_heads: Optional[int] = None,
-        dim_head: int = 64,
-        dropout: float = 0.0,
-        bias: bool = False,
-        qk_norm: Optional[str] = None,
-        added_kv_proj_dim: Optional[int] = None,
-        added_proj_bias: Optional[bool] = True,
-        out_bias: bool = True,
-        scale_qk: bool = True,
-        only_cross_attention: bool = False,
-        eps: float = 1e-5,
-        rescale_output_factor: float = 1.0,
-        residual_connection: bool = False,
-        processor=None,
-        out_dim: int = None,
-        out_context_dim: int = None,
-        context_pre_only=None,
-        pre_only=False,
-        elementwise_affine: bool = True,
-        is_causal: bool = False,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
-        self.query_dim = query_dim
-        self.use_bias = bias
-        self.is_cross_attention = cross_attention_dim is not None
-        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
-        self.rescale_output_factor = rescale_output_factor
-        self.residual_connection = residual_connection
-        self.dropout = dropout
-        self.fused_projections = False
-        self.out_dim = out_dim if out_dim is not None else query_dim
-        self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
-        self.context_pre_only = context_pre_only
-        self.pre_only = pre_only
-        self.is_causal = is_causal
-
-        self.scale_qk = scale_qk
-        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        # for slice_size > 0 the attention score computation
-        # is split across the batch axis to save memory
-        # You can set slice_size with `set_attention_slice`
-        self.sliceable_head_dim = heads
-
-        self.added_kv_proj_dim = added_kv_proj_dim
-        self.only_cross_attention = only_cross_attention
-
-        if self.added_kv_proj_dim is None and self.only_cross_attention:
-            raise ValueError(
-                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
-            )
-
-        self.group_norm = None
-        self.spatial_norm = None
-
-        self.norm_q = None
-        self.norm_k = None
-
-        self.norm_cross = None
-        self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
-
-        if not self.only_cross_attention:
-            # only relevant for the `AddedKVProcessor` classes
-            self.to_k = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
-            self.to_v = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
-        else:
-            self.to_k = None
-            self.to_v = None
-
-        self.added_proj_bias = added_proj_bias
-        if self.added_kv_proj_dim is not None:
-            self.add_k_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
-            self.add_v_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
-            if self.context_pre_only is not None:
-                self.add_q_proj = operations.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias, dtype=dtype, device=device)
-        else:
-            self.add_q_proj = None
-            self.add_k_proj = None
-            self.add_v_proj = None
-
-        if not self.pre_only:
-            self.to_out = nn.ModuleList([])
-            self.to_out.append(operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device))
-            self.to_out.append(nn.Dropout(dropout))
-        else:
-            self.to_out = None
-
-        if self.context_pre_only is not None and not self.context_pre_only:
-            self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
-        else:
-            self.to_add_out = None
-
-        self.norm_added_q = None
-        self.norm_added_k = None
-        self.processor = processor
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **cross_attention_kwargs,
-    ) -> torch.Tensor:
-        return self.processor(
-            self,
-            hidden_states,
-            encoder_hidden_states=encoder_hidden_states,
-            attention_mask=attention_mask,
-            **cross_attention_kwargs,
-        )
-
-
-class CustomLiteLAProcessor2_0:
-    """Attention processor used typically in processing the SD3-like self-attention projections. add rms norm for query and key and apply RoPE"""
-
-    def __init__(self):
-        self.kernel_func = nn.ReLU(inplace=False)
-        self.eps = 1e-15
-        self.pad_val = 1.0
-
-    def apply_rotary_emb(
-        self,
-        x: torch.Tensor,
-        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-        tensors contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x (`torch.Tensor`):
-                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
-            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-        """
-        cos, sin = freqs_cis  # [S, D]
-        cos = cos[None, None]
-        sin = sin[None, None]
-        cos, sin = cos.to(x.device), sin.to(x.device)
-
-        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-
-        return out
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        *args,
-        **kwargs,
-    ) -> torch.FloatTensor:
-        hidden_states_len = hidden_states.shape[1]
-
-        input_ndim = hidden_states.ndim
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        if encoder_hidden_states is not None:
-            context_input_ndim = encoder_hidden_states.ndim
-            if context_input_ndim == 4:
-                batch_size, channel, height, width = encoder_hidden_states.shape
-                encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size = hidden_states.shape[0]
-
-        # `sample` projections.
-        dtype = hidden_states.dtype
-        query = attn.to_q(hidden_states)
-        key = attn.to_k(hidden_states)
-        value = attn.to_v(hidden_states)
-
-        # `context` projections.
-        has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
-        if encoder_hidden_states is not None and has_encoder_hidden_state_proj:
-            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
-            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
-            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
-
-            # attention
-            if not attn.is_cross_attention:
-                query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
-                key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
-                value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
-            else:
-                query = hidden_states
-                key = encoder_hidden_states
-                value = encoder_hidden_states
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-        key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
-        value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
-
-        # RoPE需要 [B, H, S, D] 输入
-        # 此时 query是 [B, H, D, S], 需要转成 [B, H, S, D] 才能应用RoPE
-        query = query.permute(0, 1, 3, 2)  # [B, H, S, D]  (从 [B, H, D, S])
-
-        # Apply query and key normalization if needed
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if rotary_freqs_cis is not None:
-            query = self.apply_rotary_emb(query, rotary_freqs_cis)
-            if not attn.is_cross_attention:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis)
-            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
-
-        # 此时 query是 [B, H, S, D]，需要还原成 [B, H, D, S]
-        query = query.permute(0, 1, 3, 2)  # [B, H, D, S]
-
-        if attention_mask is not None:
-            # attention_mask: [B, S] -> [B, 1, S, 1]
-            attention_mask = attention_mask[:, None, :, None].to(key.dtype)  # [B, 1, S, 1]
-            query = query * attention_mask.permute(0, 1, 3, 2)  # [B, H, S, D] * [B, 1, S, 1]
-            if not attn.is_cross_attention:
-                key = key * attention_mask  # key: [B, h, S, D] 与 mask [B, 1, S, 1] 相乘
-                value = value * attention_mask.permute(0, 1, 3, 2)  # 如果 value 是 [B, h, D, S]，那么需调整mask以匹配S维度
-
-        if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
-            encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(key.dtype)  # [B, 1, S_enc, 1]
-            # 此时 key: [B, h, S_enc, D], value: [B, h, D, S_enc]
-            key = key * encoder_attention_mask  # [B, h, S_enc, D] * [B, 1, S_enc, 1]
-            value = value * encoder_attention_mask.permute(0, 1, 3, 2)  # [B, h, D, S_enc] * [B, 1, 1, S_enc]
-
-        query = self.kernel_func(query)
-        key = self.kernel_func(key)
-
-        query, key, value = query.float(), key.float(), value.float()
-
-        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
-
-        vk = torch.matmul(value, key)
-
-        hidden_states = torch.matmul(vk, query)
-
-        if hidden_states.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.float()
-
-        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
-
-        hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
-
-        hidden_states = hidden_states.to(dtype)
-        if encoder_hidden_states is not None:
-            encoder_hidden_states = encoder_hidden_states.to(dtype)
-
-        # Split the attention outputs.
-        if encoder_hidden_states is not None and not attn.is_cross_attention and has_encoder_hidden_state_proj:
-            hidden_states, encoder_hidden_states = (
-                hidden_states[:, : hidden_states_len],
-                hidden_states[:, hidden_states_len:],
-            )
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        if encoder_hidden_states is not None and not attn.context_pre_only and not attn.is_cross_attention and hasattr(attn, "to_add_out"):
-            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-        if encoder_hidden_states is not None and context_input_ndim == 4:
-            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if torch.get_autocast_gpu_dtype() == torch.float16:
-            hidden_states = hidden_states.clip(-65504, 65504)
-            if encoder_hidden_states is not None:
-                encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
-
-        return hidden_states, encoder_hidden_states
-
-
-class CustomerAttnProcessor2_0:
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
-    """
-
-    def apply_rotary_emb(
-        self,
-        x: torch.Tensor,
-        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-        to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-        reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-        tensors contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x (`torch.Tensor`):
-                Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
-            freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-        """
-        cos, sin = freqs_cis  # [S, D]
-        cos = cos[None, None]
-        sin = sin[None, None]
-        cos, sin = cos.to(x.device), sin.to(x.device)
-
-        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-
-        return out
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        residual = hidden_states
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if rotary_freqs_cis is not None:
-            query = self.apply_rotary_emb(query, rotary_freqs_cis)
-            if not attn.is_cross_attention:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis)
-            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
-                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
-
-        if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
-            # attention_mask: N x S1
-            # encoder_attention_mask: N x S2
-            # cross attention 整合attention_mask和encoder_attention_mask
-            combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
-            attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
-            attention_mask = attention_mask[:, None, :, :].expand(-1, attn.heads, -1, -1).to(query.dtype)
-
-        elif not attn.is_cross_attention and attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        hidden_states = optimized_attention(
-            query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
-        ).to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-def val2list(x: list or tuple or any, repeat_time=1) -> list:  # type: ignore
-    """Repeat `val` for `repeat_time` times and return the list or val if list/tuple."""
-    if isinstance(x, (list, tuple)):
-        return list(x)
-    return [x for _ in range(repeat_time)]
-
-
-def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple:  # type: ignore
-    """Return tuple with min_len by repeating element at idx_repeat."""
-    # convert to list first
-    x = val2list(x)
-
-    # repeat elements if necessary
-    if len(x) > 0:
-        x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
-
-    return tuple(x)
-
-
-def t2i_modulate(x, shift, scale):
-    return x * (1 + scale) + shift
-
-
-def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
-    if isinstance(kernel_size, tuple):
-        return tuple([get_same_padding(ks) for ks in kernel_size])
-    else:
-        assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be odd number"
-        return kernel_size // 2
-
-class ConvLayer(nn.Module):
-    def __init__(
-        self,
-        in_dim: int,
-        out_dim: int,
-        kernel_size=3,
-        stride=1,
-        dilation=1,
-        groups=1,
-        padding: Union[int, None] = None,
-        use_bias=False,
-        norm=None,
-        act=None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        if padding is None:
-            padding = get_same_padding(kernel_size)
-            padding *= dilation
-
-        self.in_dim = in_dim
-        self.out_dim = out_dim
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.dilation = dilation
-        self.groups = groups
-        self.padding = padding
-        self.use_bias = use_bias
-
-        self.conv = operations.Conv1d(
-            in_dim,
-            out_dim,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            groups=groups,
-            bias=use_bias,
-            device=device,
-            dtype=dtype
-        )
-        if norm is not None:
-            self.norm = operations.RMSNorm(out_dim, elementwise_affine=False, dtype=dtype, device=device)
-        else:
-            self.norm = None
-        if act is not None:
-            self.act = nn.SiLU(inplace=True)
-        else:
-            self.act = None
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x)
-        if self.norm:
-            x = self.norm(x)
-        if self.act:
-            x = self.act(x)
-        return x
-
-
-class GLUMBConv(nn.Module):
-    def __init__(
-        self,
-        in_features: int,
-        hidden_features: int,
-        out_feature=None,
-        kernel_size=3,
-        stride=1,
-        padding: Union[int, None] = None,
-        use_bias=False,
-        norm=(None, None, None),
-        act=("silu", "silu", None),
-        dilation=1,
-        dtype=None, device=None, operations=None
-    ):
-        out_feature = out_feature or in_features
-        super().__init__()
-        use_bias = val2tuple(use_bias, 3)
-        norm = val2tuple(norm, 3)
-        act = val2tuple(act, 3)
-
-        self.glu_act = nn.SiLU(inplace=False)
-        self.inverted_conv = ConvLayer(
-            in_features,
-            hidden_features * 2,
-            1,
-            use_bias=use_bias[0],
-            norm=norm[0],
-            act=act[0],
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.depth_conv = ConvLayer(
-            hidden_features * 2,
-            hidden_features * 2,
-            kernel_size,
-            stride=stride,
-            groups=hidden_features * 2,
-            padding=padding,
-            use_bias=use_bias[1],
-            norm=norm[1],
-            act=None,
-            dilation=dilation,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.point_conv = ConvLayer(
-            hidden_features,
-            out_feature,
-            1,
-            use_bias=use_bias[2],
-            norm=norm[2],
-            act=act[2],
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x.transpose(1, 2)
-        x = self.inverted_conv(x)
-        x = self.depth_conv(x)
-
-        x, gate = torch.chunk(x, 2, dim=1)
-        gate = self.glu_act(gate)
-        x = x * gate
-
-        x = self.point_conv(x)
-        x = x.transpose(1, 2)
-
-        return x
-
-
-class LinearTransformerBlock(nn.Module):
-    """
-    A Sana block with global shared adaptive layer norm (adaLN-single) conditioning.
-    """
-    def __init__(
-        self,
-        dim,
-        num_attention_heads,
-        attention_head_dim,
-        use_adaln_single=True,
-        cross_attention_dim=None,
-        added_kv_proj_dim=None,
-        context_pre_only=False,
-        mlp_ratio=4.0,
-        add_cross_attention=False,
-        add_cross_attention_dim=None,
-        qk_norm=None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-
-        self.norm1 = operations.RMSNorm(dim, elementwise_affine=False, eps=1e-6)
-        self.attn = Attention(
-            query_dim=dim,
-            cross_attention_dim=cross_attention_dim,
-            added_kv_proj_dim=added_kv_proj_dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            bias=True,
-            qk_norm=qk_norm,
-            processor=CustomLiteLAProcessor2_0(),
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.add_cross_attention = add_cross_attention
-        self.context_pre_only = context_pre_only
-
-        if add_cross_attention and add_cross_attention_dim is not None:
-            self.cross_attn = Attention(
-                query_dim=dim,
-                cross_attention_dim=add_cross_attention_dim,
-                added_kv_proj_dim=add_cross_attention_dim,
-                dim_head=attention_head_dim,
-                heads=num_attention_heads,
-                out_dim=dim,
-                context_pre_only=context_pre_only,
-                bias=True,
-                qk_norm=qk_norm,
-                processor=CustomerAttnProcessor2_0(),
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            )
-
-        self.norm2 = operations.RMSNorm(dim, 1e-06, elementwise_affine=False)
-
-        self.ff = GLUMBConv(
-            in_features=dim,
-            hidden_features=int(dim * mlp_ratio),
-            use_bias=(True, True, False),
-            norm=(None, None, None),
-            act=("silu", "silu", None),
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-        self.use_adaln_single = use_adaln_single
-        if use_adaln_single:
-            self.scale_shift_table = nn.Parameter(torch.empty(6, dim, dtype=dtype, device=device))
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: torch.FloatTensor = None,
-        attention_mask: torch.FloatTensor = None,
-        encoder_attention_mask: torch.FloatTensor = None,
-        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
-        temb: torch.FloatTensor = None,
-    ):
-
-        N = hidden_states.shape[0]
-
-        # step 1: AdaLN single
-        if self.use_adaln_single:
-            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-                comfy.model_management.cast_to(self.scale_shift_table[None], dtype=temb.dtype, device=temb.device) + temb.reshape(N, 6, -1)
-            ).chunk(6, dim=1)
-
-        norm_hidden_states = self.norm1(hidden_states)
-        if self.use_adaln_single:
-            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
-
-        # step 2: attention
-        if not self.add_cross_attention:
-            attn_output, encoder_hidden_states = self.attn(
-                hidden_states=norm_hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-            )
-        else:
-            attn_output, _ = self.attn(
-                hidden_states=norm_hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=None,
-                encoder_attention_mask=None,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=None,
-            )
-
-        if self.use_adaln_single:
-            attn_output = gate_msa * attn_output
-        hidden_states = attn_output + hidden_states
-
-        if self.add_cross_attention:
-            attn_output = self.cross_attn(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=rotary_freqs_cis_cross,
-            )
-            hidden_states = attn_output + hidden_states
-
-        # step 3: add norm
-        norm_hidden_states = self.norm2(hidden_states)
-        if self.use_adaln_single:
-            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-
-        # step 4: feed forward
-        ff_output = self.ff(norm_hidden_states)
-        if self.use_adaln_single:
-            ff_output = gate_mlp * ff_output
-
-        hidden_states = hidden_states + ff_output
-
-        return hidden_states
--- a/comfy/ldm/ace/lyric_encoder.py
+++ b/comfy/ldm/ace/lyric_encoder.py
--- a/comfy/ldm/ace/model.py
+++ b/comfy/ldm/ace/model.py
@@ -1,385 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/ace_step_transformer.py
-
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Optional, List, Union
-
-import torch
-from torch import nn
-
-import comfy.model_management
-
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-from .attention import LinearTransformerBlock, t2i_modulate
-from .lyric_encoder import ConformerEncoder as LyricEncoder
-
-
-def cross_norm(hidden_states, controlnet_input):
-    # input N x T x c
-    mean_hidden_states, std_hidden_states = hidden_states.mean(dim=(1,2), keepdim=True), hidden_states.std(dim=(1,2), keepdim=True)
-    mean_controlnet_input, std_controlnet_input = controlnet_input.mean(dim=(1,2), keepdim=True), controlnet_input.std(dim=(1,2), keepdim=True)
-    controlnet_input = (controlnet_input - mean_controlnet_input) * (std_hidden_states / (std_controlnet_input + 1e-12)) + mean_hidden_states
-    return controlnet_input
-
-
-# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
-class Qwen2RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, dtype=None, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=device).float() / self.dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype),
-            self.sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-class T2IFinalLayer(nn.Module):
-    """
-    The final layer of Sana.
-    """
-
-    def __init__(self, hidden_size, patch_size=[16, 1], out_channels=256, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm_final = operations.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size[0] * patch_size[1] * out_channels, bias=True, dtype=dtype, device=device)
-        self.scale_shift_table = nn.Parameter(torch.empty(2, hidden_size, dtype=dtype, device=device))
-        self.out_channels = out_channels
-        self.patch_size = patch_size
-
-    def unpatchfy(
-        self,
-        hidden_states: torch.Tensor,
-        width: int,
-    ):
-        # 4 unpatchify
-        new_height, new_width = 1, hidden_states.size(1)
-        hidden_states = hidden_states.reshape(
-            shape=(hidden_states.shape[0], new_height, new_width, self.patch_size[0], self.patch_size[1], self.out_channels)
-        ).contiguous()
-        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
-        output = hidden_states.reshape(
-            shape=(hidden_states.shape[0], self.out_channels, new_height * self.patch_size[0], new_width * self.patch_size[1])
-        ).contiguous()
-        if width > new_width:
-            output = torch.nn.functional.pad(output, (0, width - new_width, 0, 0), 'constant', 0)
-        elif width < new_width:
-            output = output[:, :, :, :width]
-        return output
-
-    def forward(self, x, t, output_length):
-        shift, scale = (comfy.model_management.cast_to(self.scale_shift_table[None], device=t.device, dtype=t.dtype) + t[:, None]).chunk(2, dim=1)
-        x = t2i_modulate(self.norm_final(x), shift, scale)
-        x = self.linear(x)
-        # unpatchify
-        output = self.unpatchfy(x, output_length)
-        return output
-
-
-class PatchEmbed(nn.Module):
-    """2D Image to Patch Embedding"""
-
-    def __init__(
-        self,
-        height=16,
-        width=4096,
-        patch_size=(16, 1),
-        in_channels=8,
-        embed_dim=1152,
-        bias=True,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        patch_size_h, patch_size_w = patch_size
-        self.early_conv_layers = nn.Sequential(
-            operations.Conv2d(in_channels, in_channels*256, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias, dtype=dtype, device=device),
-            operations.GroupNorm(num_groups=32, num_channels=in_channels*256, eps=1e-6, affine=True, dtype=dtype, device=device),
-            operations.Conv2d(in_channels*256, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias, dtype=dtype, device=device)
-        )
-        self.patch_size = patch_size
-        self.height, self.width = height // patch_size_h, width // patch_size_w
-        self.base_size = self.width
-
-    def forward(self, latent):
-        # early convolutions, N x C x H x W -> N x 256 * sqrt(patch_size) x H/patch_size x W/patch_size
-        latent = self.early_conv_layers(latent)
-        latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
-        return latent
-
-
-class ACEStepTransformer2DModel(nn.Module):
-    # _supports_gradient_checkpointing = True
-
-    def __init__(
-        self,
-        in_channels: Optional[int] = 8,
-        num_layers: int = 28,
-        inner_dim: int = 1536,
-        attention_head_dim: int = 64,
-        num_attention_heads: int = 24,
-        mlp_ratio: float = 4.0,
-        out_channels: int = 8,
-        max_position: int = 32768,
-        rope_theta: float = 1000000.0,
-        speaker_embedding_dim: int = 512,
-        text_embedding_dim: int = 768,
-        ssl_encoder_depths: List[int] = [9, 9],
-        ssl_names: List[str] = ["mert", "m-hubert"],
-        ssl_latent_dims: List[int] = [1024, 768],
-        lyric_encoder_vocab_size: int = 6681,
-        lyric_hidden_size: int = 1024,
-        patch_size: List[int] = [16, 1],
-        max_height: int = 16,
-        max_width: int = 4096,
-        audio_model=None,
-        dtype=None, device=None, operations=None
-
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        inner_dim = num_attention_heads * attention_head_dim
-        self.inner_dim = inner_dim
-        self.out_channels = out_channels
-        self.max_position = max_position
-        self.patch_size = patch_size
-
-        self.rope_theta = rope_theta
-
-        self.rotary_emb = Qwen2RotaryEmbedding(
-            dim=self.attention_head_dim,
-            max_position_embeddings=self.max_position,
-            base=self.rope_theta,
-            dtype=dtype,
-            device=device,
-        )
-
-        # 2. Define input layers
-        self.in_channels = in_channels
-
-        self.num_layers = num_layers
-        # 3. Define transformers blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                LinearTransformerBlock(
-                    dim=self.inner_dim,
-                    num_attention_heads=self.num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    mlp_ratio=mlp_ratio,
-                    add_cross_attention=True,
-                    add_cross_attention_dim=self.inner_dim,
-                    dtype=dtype,
-                    device=device,
-                    operations=operations,
-                )
-                for i in range(self.num_layers)
-            ]
-        )
-
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.t_block = nn.Sequential(nn.SiLU(), operations.Linear(self.inner_dim, 6 * self.inner_dim, bias=True, dtype=dtype, device=device))
-
-        # speaker
-        self.speaker_embedder = operations.Linear(speaker_embedding_dim, self.inner_dim, dtype=dtype, device=device)
-
-        # genre
-        self.genre_embedder = operations.Linear(text_embedding_dim, self.inner_dim, dtype=dtype, device=device)
-
-        # lyric
-        self.lyric_embs = operations.Embedding(lyric_encoder_vocab_size, lyric_hidden_size, dtype=dtype, device=device)
-        self.lyric_encoder = LyricEncoder(input_size=lyric_hidden_size, static_chunk_size=0, dtype=dtype, device=device, operations=operations)
-        self.lyric_proj = operations.Linear(lyric_hidden_size, self.inner_dim, dtype=dtype, device=device)
-
-        projector_dim = 2 * self.inner_dim
-
-        self.projectors = nn.ModuleList([
-            nn.Sequential(
-                operations.Linear(self.inner_dim, projector_dim, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(projector_dim, projector_dim, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(projector_dim, ssl_dim, dtype=dtype, device=device),
-            ) for ssl_dim in ssl_latent_dims
-        ])
-
-        self.proj_in = PatchEmbed(
-            height=max_height,
-            width=max_width,
-            patch_size=patch_size,
-            embed_dim=self.inner_dim,
-            bias=True,
-            dtype=dtype,
-            device=device,
-            operations=operations,
-        )
-
-        self.final_layer = T2IFinalLayer(self.inner_dim, patch_size=patch_size, out_channels=out_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward_lyric_encoder(
-        self,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        out_dtype=None,
-    ):
-        # N x T x D
-        lyric_embs = self.lyric_embs(lyric_token_idx, out_dtype=out_dtype)
-        prompt_prenet_out, _mask = self.lyric_encoder(lyric_embs, lyric_mask, decoding_chunk_size=1, num_decoding_left_chunks=-1)
-        prompt_prenet_out = self.lyric_proj(prompt_prenet_out)
-        return prompt_prenet_out
-
-    def encode(
-        self,
-        encoder_text_hidden_states: Optional[torch.Tensor] = None,
-        text_attention_mask: Optional[torch.LongTensor] = None,
-        speaker_embeds: Optional[torch.FloatTensor] = None,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        lyrics_strength=1.0,
-    ):
-
-        bs = encoder_text_hidden_states.shape[0]
-        device = encoder_text_hidden_states.device
-
-        # speaker embedding
-        encoder_spk_hidden_states = self.speaker_embedder(speaker_embeds).unsqueeze(1)
-
-        # genre embedding
-        encoder_text_hidden_states = self.genre_embedder(encoder_text_hidden_states)
-
-        # lyric
-        encoder_lyric_hidden_states = self.forward_lyric_encoder(
-            lyric_token_idx=lyric_token_idx,
-            lyric_mask=lyric_mask,
-            out_dtype=encoder_text_hidden_states.dtype,
-        )
-
-        encoder_lyric_hidden_states *= lyrics_strength
-
-        encoder_hidden_states = torch.cat([encoder_spk_hidden_states, encoder_text_hidden_states, encoder_lyric_hidden_states], dim=1)
-
-        encoder_hidden_mask = None
-        if text_attention_mask is not None:
-            speaker_mask = torch.ones(bs, 1, device=device)
-            encoder_hidden_mask = torch.cat([speaker_mask, text_attention_mask, lyric_mask], dim=1)
-
-        return encoder_hidden_states, encoder_hidden_mask
-
-    def decode(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        encoder_hidden_mask: torch.Tensor,
-        timestep: Optional[torch.Tensor],
-        output_length: int = 0,
-        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-    ):
-        embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
-        temb = self.t_block(embedded_timestep)
-
-        hidden_states = self.proj_in(hidden_states)
-
-        # controlnet logic
-        if block_controlnet_hidden_states is not None:
-            control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
-            hidden_states = hidden_states + control_condi * controlnet_scale
-
-        # inner_hidden_states = []
-
-        rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
-        encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])
-
-        for index_block, block in enumerate(self.transformer_blocks):
-            hidden_states = block(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_hidden_mask,
-                rotary_freqs_cis=rotary_freqs_cis,
-                rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
-                temb=temb,
-            )
-
-        output = self.final_layer(hidden_states, embedded_timestep, output_length)
-        return output
-
-    def forward(
-        self,
-        x,
-        timestep,
-        attention_mask=None,
-        context: Optional[torch.Tensor] = None,
-        text_attention_mask: Optional[torch.LongTensor] = None,
-        speaker_embeds: Optional[torch.FloatTensor] = None,
-        lyric_token_idx: Optional[torch.LongTensor] = None,
-        lyric_mask: Optional[torch.LongTensor] = None,
-        block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
-        controlnet_scale: Union[float, torch.Tensor] = 1.0,
-        lyrics_strength=1.0,
-        **kwargs
-    ):
-        hidden_states = x
-        encoder_text_hidden_states = context
-        encoder_hidden_states, encoder_hidden_mask = self.encode(
-            encoder_text_hidden_states=encoder_text_hidden_states,
-            text_attention_mask=text_attention_mask,
-            speaker_embeds=speaker_embeds,
-            lyric_token_idx=lyric_token_idx,
-            lyric_mask=lyric_mask,
-            lyrics_strength=lyrics_strength,
-        )
-
-        output_length = hidden_states.shape[-1]
-
-        output = self.decode(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_hidden_mask=encoder_hidden_mask,
-            timestep=timestep,
-            output_length=output_length,
-            block_controlnet_hidden_states=block_controlnet_hidden_states,
-            controlnet_scale=controlnet_scale,
-        )
-
-        return output
--- a/comfy/ldm/ace/vae/autoencoder_dc.py
+++ b/comfy/ldm/ace/vae/autoencoder_dc.py
@@ -1,644 +0,0 @@
-# Rewritten from diffusers
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Tuple, Union
-
-import comfy.model_management
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-
-class RMSNorm(ops.RMSNorm):
-    def __init__(self, dim, eps=1e-5, elementwise_affine=True, bias=False):
-        super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
-        if elementwise_affine:
-            self.bias = nn.Parameter(torch.empty(dim)) if bias else None
-
-    def forward(self, x):
-        x = super().forward(x)
-        if self.elementwise_affine:
-            if self.bias is not None:
-                x = x + comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device)
-        return x
-
-
-def get_normalization(norm_type, num_features, num_groups=32, eps=1e-5):
-    if norm_type == "batch_norm":
-        return nn.BatchNorm2d(num_features)
-    elif norm_type == "group_norm":
-        return ops.GroupNorm(num_groups, num_features)
-    elif norm_type == "layer_norm":
-        return ops.LayerNorm(num_features)
-    elif norm_type == "rms_norm":
-        return RMSNorm(num_features, eps=eps, elementwise_affine=True, bias=True)
-    else:
-        raise ValueError(f"Unknown normalization type: {norm_type}")
-
-
-def get_activation(activation_type):
-    if activation_type == "relu":
-        return nn.ReLU()
-    elif activation_type == "relu6":
-        return nn.ReLU6()
-    elif activation_type == "silu":
-        return nn.SiLU()
-    elif activation_type == "leaky_relu":
-        return nn.LeakyReLU(0.2)
-    else:
-        raise ValueError(f"Unknown activation type: {activation_type}")
-
-
-class ResBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        norm_type: str = "batch_norm",
-        act_fn: str = "relu6",
-    ) -> None:
-        super().__init__()
-
-        self.norm_type = norm_type
-        self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity()
-        self.conv1 = ops.Conv2d(in_channels, in_channels, 3, 1, 1)
-        self.conv2 = ops.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False)
-        self.norm = get_normalization(norm_type, out_channels)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        residual = hidden_states
-        hidden_states = self.conv1(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-        hidden_states = self.conv2(hidden_states)
-
-        if self.norm_type == "rms_norm":
-            # move channel to the last dimension so we apply RMSnorm across channel dimension
-            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        else:
-            hidden_states = self.norm(hidden_states)
-
-        return hidden_states + residual
-
-class SanaMultiscaleAttentionProjection(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        num_attention_heads: int,
-        kernel_size: int,
-    ) -> None:
-        super().__init__()
-
-        channels = 3 * in_channels
-        self.proj_in = ops.Conv2d(
-            channels,
-            channels,
-            kernel_size,
-            padding=kernel_size // 2,
-            groups=channels,
-            bias=False,
-        )
-        self.proj_out = ops.Conv2d(channels, channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.proj_in(hidden_states)
-        hidden_states = self.proj_out(hidden_states)
-        return hidden_states
-
-class SanaMultiscaleLinearAttention(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        num_attention_heads: int = None,
-        attention_head_dim: int = 8,
-        mult: float = 1.0,
-        norm_type: str = "batch_norm",
-        kernel_sizes: tuple = (5,),
-        eps: float = 1e-15,
-        residual_connection: bool = False,
-    ):
-        super().__init__()
-
-        self.eps = eps
-        self.attention_head_dim = attention_head_dim
-        self.norm_type = norm_type
-        self.residual_connection = residual_connection
-
-        num_attention_heads = (
-            int(in_channels // attention_head_dim * mult)
-            if num_attention_heads is None
-            else num_attention_heads
-        )
-        inner_dim = num_attention_heads * attention_head_dim
-
-        self.to_q = ops.Linear(in_channels, inner_dim, bias=False)
-        self.to_k = ops.Linear(in_channels, inner_dim, bias=False)
-        self.to_v = ops.Linear(in_channels, inner_dim, bias=False)
-
-        self.to_qkv_multiscale = nn.ModuleList()
-        for kernel_size in kernel_sizes:
-            self.to_qkv_multiscale.append(
-                SanaMultiscaleAttentionProjection(inner_dim, num_attention_heads, kernel_size)
-            )
-
-        self.nonlinearity = nn.ReLU()
-        self.to_out = ops.Linear(inner_dim * (1 + len(kernel_sizes)), out_channels, bias=False)
-        self.norm_out = get_normalization(norm_type, out_channels)
-
-    def apply_linear_attention(self, query, key, value):
-        value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1)
-        scores = torch.matmul(value, key.transpose(-1, -2))
-        hidden_states = torch.matmul(scores, query)
-
-        hidden_states = hidden_states.to(dtype=torch.float32)
-        hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
-        return hidden_states
-
-    def apply_quadratic_attention(self, query, key, value):
-        scores = torch.matmul(key.transpose(-1, -2), query)
-        scores = scores.to(dtype=torch.float32)
-        scores = scores / (torch.sum(scores, dim=2, keepdim=True) + self.eps)
-        hidden_states = torch.matmul(value, scores.to(value.dtype))
-        return hidden_states
-
-    def forward(self, hidden_states):
-        height, width = hidden_states.shape[-2:]
-        if height * width > self.attention_head_dim:
-            use_linear_attention = True
-        else:
-            use_linear_attention = False
-
-        residual = hidden_states
-
-        batch_size, _, height, width = list(hidden_states.size())
-        original_dtype = hidden_states.dtype
-
-        hidden_states = hidden_states.movedim(1, -1)
-        query = self.to_q(hidden_states)
-        key = self.to_k(hidden_states)
-        value = self.to_v(hidden_states)
-        hidden_states = torch.cat([query, key, value], dim=3)
-        hidden_states = hidden_states.movedim(-1, 1)
-
-        multi_scale_qkv = [hidden_states]
-        for block in self.to_qkv_multiscale:
-            multi_scale_qkv.append(block(hidden_states))
-
-        hidden_states = torch.cat(multi_scale_qkv, dim=1)
-
-        if use_linear_attention:
-            # for linear attention upcast hidden_states to float32
-            hidden_states = hidden_states.to(dtype=torch.float32)
-
-        hidden_states = hidden_states.reshape(batch_size, -1, 3 * self.attention_head_dim, height * width)
-
-        query, key, value = hidden_states.chunk(3, dim=2)
-        query = self.nonlinearity(query)
-        key = self.nonlinearity(key)
-
-        if use_linear_attention:
-            hidden_states = self.apply_linear_attention(query, key, value)
-            hidden_states = hidden_states.to(dtype=original_dtype)
-        else:
-            hidden_states = self.apply_quadratic_attention(query, key, value)
-
-        hidden_states = torch.reshape(hidden_states, (batch_size, -1, height, width))
-        hidden_states = self.to_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-
-        if self.norm_type == "rms_norm":
-            hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        else:
-            hidden_states = self.norm_out(hidden_states)
-
-        if self.residual_connection:
-            hidden_states = hidden_states + residual
-
-        return hidden_states
-
-
-class EfficientViTBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        mult: float = 1.0,
-        attention_head_dim: int = 32,
-        qkv_multiscales: tuple = (5,),
-        norm_type: str = "batch_norm",
-    ) -> None:
-        super().__init__()
-
-        self.attn = SanaMultiscaleLinearAttention(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            mult=mult,
-            attention_head_dim=attention_head_dim,
-            norm_type=norm_type,
-            kernel_sizes=qkv_multiscales,
-            residual_connection=True,
-        )
-
-        self.conv_out = GLUMBConv(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            norm_type="rms_norm",
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.attn(x)
-        x = self.conv_out(x)
-        return x
-
-
-class GLUMBConv(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        expand_ratio: float = 4,
-        norm_type: str = None,
-        residual_connection: bool = True,
-    ) -> None:
-        super().__init__()
-
-        hidden_channels = int(expand_ratio * in_channels)
-        self.norm_type = norm_type
-        self.residual_connection = residual_connection
-
-        self.nonlinearity = nn.SiLU()
-        self.conv_inverted = ops.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
-        self.conv_depth = ops.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
-        self.conv_point = ops.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
-
-        self.norm = None
-        if norm_type == "rms_norm":
-            self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.residual_connection:
-            residual = hidden_states
-
-        hidden_states = self.conv_inverted(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-
-        hidden_states = self.conv_depth(hidden_states)
-        hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
-        hidden_states = hidden_states * self.nonlinearity(gate)
-
-        hidden_states = self.conv_point(hidden_states)
-
-        if self.norm_type == "rms_norm":
-            # move channel to the last dimension so we apply RMSnorm across channel dimension
-            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
-
-        if self.residual_connection:
-            hidden_states = hidden_states + residual
-
-        return hidden_states
-
-
-def get_block(
-    block_type: str,
-    in_channels: int,
-    out_channels: int,
-    attention_head_dim: int,
-    norm_type: str,
-    act_fn: str,
-    qkv_mutliscales: tuple = (),
-):
-    if block_type == "ResBlock":
-        block = ResBlock(in_channels, out_channels, norm_type, act_fn)
-    elif block_type == "EfficientViTBlock":
-        block = EfficientViTBlock(
-            in_channels,
-            attention_head_dim=attention_head_dim,
-            norm_type=norm_type,
-            qkv_multiscales=qkv_mutliscales
-        )
-    else:
-        raise ValueError(f"Block with {block_type=} is not supported.")
-
-    return block
-
-
-class DCDownBlock2d(nn.Module):
-    def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
-        super().__init__()
-
-        self.downsample = downsample
-        self.factor = 2
-        self.stride = 1 if downsample else 2
-        self.group_size = in_channels * self.factor**2 // out_channels
-        self.shortcut = shortcut
-
-        out_ratio = self.factor**2
-        if downsample:
-            assert out_channels % out_ratio == 0
-            out_channels = out_channels // out_ratio
-
-        self.conv = ops.Conv2d(
-            in_channels,
-            out_channels,
-            kernel_size=3,
-            stride=self.stride,
-            padding=1,
-        )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        x = self.conv(hidden_states)
-        if self.downsample:
-            x = F.pixel_unshuffle(x, self.factor)
-
-        if self.shortcut:
-            y = F.pixel_unshuffle(hidden_states, self.factor)
-            y = y.unflatten(1, (-1, self.group_size))
-            y = y.mean(dim=2)
-            hidden_states = x + y
-        else:
-            hidden_states = x
-
-        return hidden_states
-
-
-class DCUpBlock2d(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        interpolate: bool = False,
-        shortcut: bool = True,
-        interpolation_mode: str = "nearest",
-    ) -> None:
-        super().__init__()
-
-        self.interpolate = interpolate
-        self.interpolation_mode = interpolation_mode
-        self.shortcut = shortcut
-        self.factor = 2
-        self.repeats = out_channels * self.factor**2 // in_channels
-
-        out_ratio = self.factor**2
-        if not interpolate:
-            out_channels = out_channels * out_ratio
-
-        self.conv = ops.Conv2d(in_channels, out_channels, 3, 1, 1)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.interpolate:
-            x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
-            x = self.conv(x)
-        else:
-            x = self.conv(hidden_states)
-            x = F.pixel_shuffle(x, self.factor)
-
-        if self.shortcut:
-            y = hidden_states.repeat_interleave(self.repeats, dim=1, output_size=hidden_states.shape[1] * self.repeats)
-            y = F.pixel_shuffle(y, self.factor)
-            hidden_states = x + y
-        else:
-            hidden_states = x
-
-        return hidden_states
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        latent_channels: int,
-        attention_head_dim: int = 32,
-        block_type: str or tuple = "ResBlock",
-        block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
-        qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
-        downsample_block_type: str = "pixel_unshuffle",
-        out_shortcut: bool = True,
-    ):
-        super().__init__()
-
-        num_blocks = len(block_out_channels)
-
-        if isinstance(block_type, str):
-            block_type = (block_type,) * num_blocks
-
-        if layers_per_block[0] > 0:
-            self.conv_in = ops.Conv2d(
-                in_channels,
-                block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
-                kernel_size=3,
-                stride=1,
-                padding=1,
-            )
-        else:
-            self.conv_in = DCDownBlock2d(
-                in_channels=in_channels,
-                out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
-                downsample=downsample_block_type == "pixel_unshuffle",
-                shortcut=False,
-            )
-
-        down_blocks = []
-        for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
-            down_block_list = []
-
-            for _ in range(num_layers):
-                block = get_block(
-                    block_type[i],
-                    out_channel,
-                    out_channel,
-                    attention_head_dim=attention_head_dim,
-                    norm_type="rms_norm",
-                    act_fn="silu",
-                    qkv_mutliscales=qkv_multiscales[i],
-                )
-                down_block_list.append(block)
-
-            if i < num_blocks - 1 and num_layers > 0:
-                downsample_block = DCDownBlock2d(
-                    in_channels=out_channel,
-                    out_channels=block_out_channels[i + 1],
-                    downsample=downsample_block_type == "pixel_unshuffle",
-                    shortcut=True,
-                )
-                down_block_list.append(downsample_block)
-
-            down_blocks.append(nn.Sequential(*down_block_list))
-
-        self.down_blocks = nn.ModuleList(down_blocks)
-
-        self.conv_out = ops.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)
-
-        self.out_shortcut = out_shortcut
-        if out_shortcut:
-            self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.conv_in(hidden_states)
-        for down_block in self.down_blocks:
-            hidden_states = down_block(hidden_states)
-
-        if self.out_shortcut:
-            x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
-            x = x.mean(dim=2)
-            hidden_states = self.conv_out(hidden_states) + x
-        else:
-            hidden_states = self.conv_out(hidden_states)
-
-        return hidden_states
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        latent_channels: int,
-        attention_head_dim: int = 32,
-        block_type: str or tuple = "ResBlock",
-        block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
-        qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
-        norm_type: str or tuple = "rms_norm",
-        act_fn: str or tuple = "silu",
-        upsample_block_type: str = "pixel_shuffle",
-        in_shortcut: bool = True,
-    ):
-        super().__init__()
-
-        num_blocks = len(block_out_channels)
-
-        if isinstance(block_type, str):
-            block_type = (block_type,) * num_blocks
-        if isinstance(norm_type, str):
-            norm_type = (norm_type,) * num_blocks
-        if isinstance(act_fn, str):
-            act_fn = (act_fn,) * num_blocks
-
-        self.conv_in = ops.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)
-
-        self.in_shortcut = in_shortcut
-        if in_shortcut:
-            self.in_shortcut_repeats = block_out_channels[-1] // latent_channels
-
-        up_blocks = []
-        for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
-            up_block_list = []
-
-            if i < num_blocks - 1 and num_layers > 0:
-                upsample_block = DCUpBlock2d(
-                    block_out_channels[i + 1],
-                    out_channel,
-                    interpolate=upsample_block_type == "interpolate",
-                    shortcut=True,
-                )
-                up_block_list.append(upsample_block)
-
-            for _ in range(num_layers):
-                block = get_block(
-                    block_type[i],
-                    out_channel,
-                    out_channel,
-                    attention_head_dim=attention_head_dim,
-                    norm_type=norm_type[i],
-                    act_fn=act_fn[i],
-                    qkv_mutliscales=qkv_multiscales[i],
-                )
-                up_block_list.append(block)
-
-            up_blocks.insert(0, nn.Sequential(*up_block_list))
-
-        self.up_blocks = nn.ModuleList(up_blocks)
-
-        channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
-
-        self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
-        self.conv_out = None
-
-        if layers_per_block[0] > 0:
-            self.conv_out = ops.Conv2d(channels, in_channels, 3, 1, 1)
-        else:
-            self.conv_out = DCUpBlock2d(
-                channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
-            )
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.in_shortcut:
-            x = hidden_states.repeat_interleave(
-                self.in_shortcut_repeats, dim=1, output_size=hidden_states.shape[1] * self.in_shortcut_repeats
-            )
-            hidden_states = self.conv_in(hidden_states) + x
-        else:
-            hidden_states = self.conv_in(hidden_states)
-
-        for up_block in reversed(self.up_blocks):
-            hidden_states = up_block(hidden_states)
-
-        hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
-        hidden_states = self.conv_act(hidden_states)
-        hidden_states = self.conv_out(hidden_states)
-        return hidden_states
-
-
-class AutoencoderDC(nn.Module):
-    def __init__(
-        self,
-        in_channels: int = 2,
-        latent_channels: int = 8,
-        attention_head_dim: int = 32,
-        encoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
-        decoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
-        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
-        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
-        encoder_layers_per_block: Tuple[int] = (2, 2, 3, 3),
-        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3),
-        encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
-        decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
-        upsample_block_type: str = "interpolate",
-        downsample_block_type: str = "Conv",
-        decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
-        decoder_act_fns: Union[str, Tuple[str]] = "silu",
-        scaling_factor: float = 0.41407,
-    ) -> None:
-        super().__init__()
-
-        self.encoder = Encoder(
-            in_channels=in_channels,
-            latent_channels=latent_channels,
-            attention_head_dim=attention_head_dim,
-            block_type=encoder_block_types,
-            block_out_channels=encoder_block_out_channels,
-            layers_per_block=encoder_layers_per_block,
-            qkv_multiscales=encoder_qkv_multiscales,
-            downsample_block_type=downsample_block_type,
-        )
-
-        self.decoder = Decoder(
-            in_channels=in_channels,
-            latent_channels=latent_channels,
-            attention_head_dim=attention_head_dim,
-            block_type=decoder_block_types,
-            block_out_channels=decoder_block_out_channels,
-            layers_per_block=decoder_layers_per_block,
-            qkv_multiscales=decoder_qkv_multiscales,
-            norm_type=decoder_norm_types,
-            act_fn=decoder_act_fns,
-            upsample_block_type=upsample_block_type,
-        )
-
-        self.scaling_factor = scaling_factor
-        self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
-
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Internal encoding function."""
-        encoded = self.encoder(x)
-        return encoded * self.scaling_factor
-
-    def decode(self, z: torch.Tensor) -> torch.Tensor:
-        # Scale the latents back
-        z = z / self.scaling_factor
-        decoded = self.decoder(z)
-        return decoded
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        z = self.encode(x)
-        return self.decode(z)
-
--- a/comfy/ldm/ace/vae/music_dcae_pipeline.py
+++ b/comfy/ldm/ace/vae/music_dcae_pipeline.py
@@ -1,109 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_dcae_pipeline.py
-import torch
-from .autoencoder_dc import AutoencoderDC
-import logging
-try:
-    import torchaudio
-except:
-    logging.warning("torchaudio missing, ACE model will be broken")
-
-import torchvision.transforms as transforms
-from .music_vocoder import ADaMoSHiFiGANV1
-
-
-class MusicDCAE(torch.nn.Module):
-    def __init__(self, source_sample_rate=None, dcae_config={}, vocoder_config={}):
-        super(MusicDCAE, self).__init__()
-
-        self.dcae = AutoencoderDC(**dcae_config)
-        self.vocoder = ADaMoSHiFiGANV1(**vocoder_config)
-
-        if source_sample_rate is None:
-            self.source_sample_rate = 48000
-        else:
-            self.source_sample_rate = source_sample_rate
-
-        # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
-
-        self.transform = transforms.Compose([
-            transforms.Normalize(0.5, 0.5),
-        ])
-        self.min_mel_value = -11.0
-        self.max_mel_value = 3.0
-        self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
-        self.mel_chunk_size = 1024
-        self.time_dimention_multiple = 8
-        self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
-        self.scale_factor = 0.1786
-        self.shift_factor = -1.9091
-
-    def load_audio(self, audio_path):
-        audio, sr = torchaudio.load(audio_path)
-        return audio, sr
-
-    def forward_mel(self, audios):
-        mels = []
-        for i in range(len(audios)):
-            image = self.vocoder.mel_transform(audios[i])
-            mels.append(image)
-        mels = torch.stack(mels)
-        return mels
-
-    @torch.no_grad()
-    def encode(self, audios, audio_lengths=None, sr=None):
-        if audio_lengths is None:
-            audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
-            audio_lengths = audio_lengths.to(audios.device)
-
-        if sr is None:
-            sr = self.source_sample_rate
-
-        if sr != 44100:
-            audios = torchaudio.functional.resample(audios, sr, 44100)
-
-        max_audio_len = audios.shape[-1]
-        if max_audio_len % (8 * 512) != 0:
-            audios = torch.nn.functional.pad(audios, (0, 8 * 512 - max_audio_len % (8 * 512)))
-
-        mels = self.forward_mel(audios)
-        mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
-        mels = self.transform(mels)
-        latents = []
-        for mel in mels:
-            latent = self.dcae.encoder(mel.unsqueeze(0))
-            latents.append(latent)
-        latents = torch.cat(latents, dim=0)
-        # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
-        latents = (latents - self.shift_factor) * self.scale_factor
-        return latents
-        # return latents, latent_lengths
-
-    @torch.no_grad()
-    def decode(self, latents, audio_lengths=None, sr=None):
-        latents = latents / self.scale_factor + self.shift_factor
-
-        pred_wavs = []
-
-        for latent in latents:
-            mels = self.dcae.decoder(latent.unsqueeze(0))
-            mels = mels * 0.5 + 0.5
-            mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
-            wav = self.vocoder.decode(mels[0]).squeeze(1)
-
-            if sr is not None:
-                # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
-                wav = torchaudio.functional.resample(wav, 44100, sr)
-                # wav = resampler(wav)
-            else:
-                sr = 44100
-            pred_wavs.append(wav)
-
-        if audio_lengths is not None:
-            pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
-        return torch.stack(pred_wavs)
-        # return sr, pred_wavs
-
-    def forward(self, audios, audio_lengths=None, sr=None):
-        latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
-        sr, pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
-        return sr, pred_wavs, latents, latent_lengths
--- a/comfy/ldm/ace/vae/music_log_mel.py
+++ b/comfy/ldm/ace/vae/music_log_mel.py
@@ -1,113 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_log_mel.py
-import torch
-import torch.nn as nn
-from torch import Tensor
-import logging
-try:
-    from torchaudio.transforms import MelScale
-except:
-    logging.warning("torchaudio missing, ACE model will be broken")
-
-import comfy.model_management
-
-class LinearSpectrogram(nn.Module):
-    def __init__(
-        self,
-        n_fft=2048,
-        win_length=2048,
-        hop_length=512,
-        center=False,
-        mode="pow2_sqrt",
-    ):
-        super().__init__()
-
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.hop_length = hop_length
-        self.center = center
-        self.mode = mode
-
-        self.register_buffer("window", torch.hann_window(win_length))
-
-    def forward(self, y: Tensor) -> Tensor:
-        if y.ndim == 3:
-            y = y.squeeze(1)
-
-        y = torch.nn.functional.pad(
-            y.unsqueeze(1),
-            (
-                (self.win_length - self.hop_length) // 2,
-                (self.win_length - self.hop_length + 1) // 2,
-            ),
-            mode="reflect",
-        ).squeeze(1)
-        dtype = y.dtype
-        spec = torch.stft(
-            y.float(),
-            self.n_fft,
-            hop_length=self.hop_length,
-            win_length=self.win_length,
-            window=comfy.model_management.cast_to(self.window, dtype=torch.float32, device=y.device),
-            center=self.center,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-            return_complex=True,
-        )
-        spec = torch.view_as_real(spec)
-
-        if self.mode == "pow2_sqrt":
-            spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-        spec = spec.to(dtype)
-        return spec
-
-
-class LogMelSpectrogram(nn.Module):
-    def __init__(
-        self,
-        sample_rate=44100,
-        n_fft=2048,
-        win_length=2048,
-        hop_length=512,
-        n_mels=128,
-        center=False,
-        f_min=0.0,
-        f_max=None,
-    ):
-        super().__init__()
-
-        self.sample_rate = sample_rate
-        self.n_fft = n_fft
-        self.win_length = win_length
-        self.hop_length = hop_length
-        self.center = center
-        self.n_mels = n_mels
-        self.f_min = f_min
-        self.f_max = f_max or sample_rate // 2
-
-        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
-        self.mel_scale = MelScale(
-            self.n_mels,
-            self.sample_rate,
-            self.f_min,
-            self.f_max,
-            self.n_fft // 2 + 1,
-            "slaney",
-            "slaney",
-        )
-
-    def compress(self, x: Tensor) -> Tensor:
-        return torch.log(torch.clamp(x, min=1e-5))
-
-    def decompress(self, x: Tensor) -> Tensor:
-        return torch.exp(x)
-
-    def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
-        linear = self.spectrogram(x)
-        x = self.mel_scale(linear)
-        x = self.compress(x)
-        # print(x.shape)
-        if return_linear:
-            return x, self.compress(linear)
-
-        return x
--- a/comfy/ldm/ace/vae/music_vocoder.py
+++ b/comfy/ldm/ace/vae/music_vocoder.py
@@ -1,538 +0,0 @@
-# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_vocoder.py
-import torch
-from torch import nn
-
-from functools import partial
-from math import prod
-from typing import Callable, Tuple, List
-
-import numpy as np
-import torch.nn.functional as F
-from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
-
-from .music_log_mel import LogMelSpectrogram
-
-import comfy.model_management
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-    'survival rate' as the argument.
-
-    """  # noqa: E501
-
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
-    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-    if keep_prob > 0.0 and scale_by_keep:
-        random_tensor.div_(keep_prob)
-    return x * random_tensor
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""  # noqa: E501
-
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
-
-    def extra_repr(self):
-        return f"drop_prob={round(self.drop_prob,3):0.3f}"
-
-
-class LayerNorm(nn.Module):
-    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
-    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
-    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
-    with shape (batch_size, channels, height, width).
-    """  # noqa: E501
-
-    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(normalized_shape))
-        self.bias = nn.Parameter(torch.zeros(normalized_shape))
-        self.eps = eps
-        self.data_format = data_format
-        if self.data_format not in ["channels_last", "channels_first"]:
-            raise NotImplementedError
-        self.normalized_shape = (normalized_shape,)
-
-    def forward(self, x):
-        if self.data_format == "channels_last":
-            return F.layer_norm(
-                x, self.normalized_shape, comfy.model_management.cast_to(self.weight, dtype=x.dtype, device=x.device), comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device), self.eps
-            )
-        elif self.data_format == "channels_first":
-            u = x.mean(1, keepdim=True)
-            s = (x - u).pow(2).mean(1, keepdim=True)
-            x = (x - u) / torch.sqrt(s + self.eps)
-            x = comfy.model_management.cast_to(self.weight[:, None], dtype=x.dtype, device=x.device) * x + comfy.model_management.cast_to(self.bias[:, None], dtype=x.dtype, device=x.device)
-            return x
-
-
-class ConvNeXtBlock(nn.Module):
-    r"""ConvNeXt Block. There are two equivalent implementations:
-    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
-    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
-    We use (2) as we find it slightly faster in PyTorch
-
-    Args:
-        dim (int): Number of input channels.
-        drop_path (float): Stochastic depth rate. Default: 0.0
-        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        kernel_size (int): Kernel size for depthwise conv. Default: 7.
-        dilation (int): Dilation for depthwise conv. Default: 1.
-    """  # noqa: E501
-
-    def __init__(
-        self,
-        dim: int,
-        drop_path: float = 0.0,
-        layer_scale_init_value: float = 1e-6,
-        mlp_ratio: float = 4.0,
-        kernel_size: int = 7,
-        dilation: int = 1,
-    ):
-        super().__init__()
-
-        self.dwconv = ops.Conv1d(
-            dim,
-            dim,
-            kernel_size=kernel_size,
-            padding=int(dilation * (kernel_size - 1) / 2),
-            groups=dim,
-        )  # depthwise conv
-        self.norm = LayerNorm(dim, eps=1e-6)
-        self.pwconv1 = ops.Linear(
-            dim, int(mlp_ratio * dim)
-        )  # pointwise/1x1 convs, implemented with linear layers
-        self.act = nn.GELU()
-        self.pwconv2 = ops.Linear(int(mlp_ratio * dim), dim)
-        self.gamma = (
-            nn.Parameter(torch.empty((dim)), requires_grad=False)
-            if layer_scale_init_value > 0
-            else None
-        )
-        self.drop_path = DropPath(
-            drop_path) if drop_path > 0.0 else nn.Identity()
-
-    def forward(self, x, apply_residual: bool = True):
-        input = x
-
-        x = self.dwconv(x)
-        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
-        x = self.norm(x)
-        x = self.pwconv1(x)
-        x = self.act(x)
-        x = self.pwconv2(x)
-
-        if self.gamma is not None:
-            x = comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) * x
-
-        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
-        x = self.drop_path(x)
-
-        if apply_residual:
-            x = input + x
-
-        return x
-
-
-class ParallelConvNeXtBlock(nn.Module):
-    def __init__(self, kernel_sizes: List[int], *args, **kwargs):
-        super().__init__()
-        self.blocks = nn.ModuleList(
-            [
-                ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
-                for kernel_size in kernel_sizes
-            ]
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return torch.stack(
-            [block(x, apply_residual=False) for block in self.blocks] + [x],
-            dim=1,
-        ).sum(dim=1)
-
-
-class ConvNeXtEncoder(nn.Module):
-    def __init__(
-        self,
-        input_channels=3,
-        depths=[3, 3, 9, 3],
-        dims=[96, 192, 384, 768],
-        drop_path_rate=0.0,
-        layer_scale_init_value=1e-6,
-        kernel_sizes: Tuple[int] = (7,),
-    ):
-        super().__init__()
-        assert len(depths) == len(dims)
-
-        self.channel_layers = nn.ModuleList()
-        stem = nn.Sequential(
-            ops.Conv1d(
-                input_channels,
-                dims[0],
-                kernel_size=7,
-                padding=3,
-                padding_mode="replicate",
-            ),
-            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
-        )
-        self.channel_layers.append(stem)
-
-        for i in range(len(depths) - 1):
-            mid_layer = nn.Sequential(
-                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
-                ops.Conv1d(dims[i], dims[i + 1], kernel_size=1),
-            )
-            self.channel_layers.append(mid_layer)
-
-        block_fn = (
-            partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
-            if len(kernel_sizes) == 1
-            else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
-        )
-
-        self.stages = nn.ModuleList()
-        drop_path_rates = [
-            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
-        ]
-
-        cur = 0
-        for i in range(len(depths)):
-            stage = nn.Sequential(
-                *[
-                    block_fn(
-                        dim=dims[i],
-                        drop_path=drop_path_rates[cur + j],
-                        layer_scale_init_value=layer_scale_init_value,
-                    )
-                    for j in range(depths[i])
-                ]
-            )
-            self.stages.append(stage)
-            cur += depths[i]
-
-        self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
-
-    def forward(
-        self,
-        x: torch.Tensor,
-    ) -> torch.Tensor:
-        for channel_layer, stage in zip(self.channel_layers, self.stages):
-            x = channel_layer(x)
-            x = stage(x)
-
-        return self.norm(x)
-
-
-def get_padding(kernel_size, dilation=1):
-    return (kernel_size * dilation - dilation) // 2
-
-
-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super().__init__()
-
-        self.convs1 = nn.ModuleList(
-            [
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-
-        self.convs2 = nn.ModuleList(
-            [
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.silu(x)
-            xt = c1(xt)
-            xt = F.silu(xt)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for conv in self.convs1:
-            remove_weight_norm(conv)
-        for conv in self.convs2:
-            remove_weight_norm(conv)
-
-
-class HiFiGANGenerator(nn.Module):
-    def __init__(
-        self,
-        *,
-        hop_length: int = 512,
-        upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
-        upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
-        resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
-        resblock_dilation_sizes: Tuple[Tuple[int]] = (
-            (1, 3, 5), (1, 3, 5), (1, 3, 5)),
-        num_mels: int = 128,
-        upsample_initial_channel: int = 512,
-        use_template: bool = True,
-        pre_conv_kernel_size: int = 7,
-        post_conv_kernel_size: int = 7,
-        post_activation: Callable = partial(nn.SiLU, inplace=True),
-    ):
-        super().__init__()
-
-        assert (
-            prod(upsample_rates) == hop_length
-        ), f"hop_length must be {prod(upsample_rates)}"
-
-        self.conv_pre = torch.nn.utils.parametrizations.weight_norm(
-            ops.Conv1d(
-                num_mels,
-                upsample_initial_channel,
-                pre_conv_kernel_size,
-                1,
-                padding=get_padding(pre_conv_kernel_size),
-            )
-        )
-
-        self.num_upsamples = len(upsample_rates)
-        self.num_kernels = len(resblock_kernel_sizes)
-
-        self.noise_convs = nn.ModuleList()
-        self.use_template = use_template
-        self.ups = nn.ModuleList()
-
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(
-                torch.nn.utils.parametrizations.weight_norm(
-                    ops.ConvTranspose1d(
-                        upsample_initial_channel // (2**i),
-                        upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-            if not use_template:
-                continue
-
-            if i + 1 < len(upsample_rates):
-                stride_f0 = np.prod(upsample_rates[i + 1:])
-                self.noise_convs.append(
-                    ops.Conv1d(
-                        1,
-                        c_cur,
-                        kernel_size=stride_f0 * 2,
-                        stride=stride_f0,
-                        padding=stride_f0 // 2,
-                    )
-                )
-            else:
-                self.noise_convs.append(ops.Conv1d(1, c_cur, kernel_size=1))
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
-                self.resblocks.append(ResBlock1(ch, k, d))
-
-        self.activation_post = post_activation()
-        self.conv_post = torch.nn.utils.parametrizations.weight_norm(
-            ops.Conv1d(
-                ch,
-                1,
-                post_conv_kernel_size,
-                1,
-                padding=get_padding(post_conv_kernel_size),
-            )
-        )
-
-    def forward(self, x, template=None):
-        x = self.conv_pre(x)
-
-        for i in range(self.num_upsamples):
-            x = F.silu(x, inplace=True)
-            x = self.ups[i](x)
-
-            if self.use_template:
-                x = x + self.noise_convs[i](template)
-
-            xs = None
-
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-
-            x = xs / self.num_kernels
-
-        x = self.activation_post(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        for up in self.ups:
-            remove_weight_norm(up)
-        for block in self.resblocks:
-            block.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-
-
-class ADaMoSHiFiGANV1(nn.Module):
-    def __init__(
-        self,
-        input_channels: int = 128,
-        depths: List[int] = [3, 3, 9, 3],
-        dims: List[int] = [128, 256, 384, 512],
-        drop_path_rate: float = 0.0,
-        kernel_sizes: Tuple[int] = (7,),
-        upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
-        upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
-        resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
-        resblock_dilation_sizes: Tuple[Tuple[int]] = (
-            (1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)),
-        num_mels: int = 512,
-        upsample_initial_channel: int = 1024,
-        use_template: bool = False,
-        pre_conv_kernel_size: int = 13,
-        post_conv_kernel_size: int = 13,
-        sampling_rate: int = 44100,
-        n_fft: int = 2048,
-        win_length: int = 2048,
-        hop_length: int = 512,
-        f_min: int = 40,
-        f_max: int = 16000,
-        n_mels: int = 128,
-    ):
-        super().__init__()
-
-        self.backbone = ConvNeXtEncoder(
-            input_channels=input_channels,
-            depths=depths,
-            dims=dims,
-            drop_path_rate=drop_path_rate,
-            kernel_sizes=kernel_sizes,
-        )
-
-        self.head = HiFiGANGenerator(
-            hop_length=hop_length,
-            upsample_rates=upsample_rates,
-            upsample_kernel_sizes=upsample_kernel_sizes,
-            resblock_kernel_sizes=resblock_kernel_sizes,
-            resblock_dilation_sizes=resblock_dilation_sizes,
-            num_mels=num_mels,
-            upsample_initial_channel=upsample_initial_channel,
-            use_template=use_template,
-            pre_conv_kernel_size=pre_conv_kernel_size,
-            post_conv_kernel_size=post_conv_kernel_size,
-        )
-        self.sampling_rate = sampling_rate
-        self.mel_transform = LogMelSpectrogram(
-            sample_rate=sampling_rate,
-            n_fft=n_fft,
-            win_length=win_length,
-            hop_length=hop_length,
-            f_min=f_min,
-            f_max=f_max,
-            n_mels=n_mels,
-        )
-        self.eval()
-
-    @torch.no_grad()
-    def decode(self, mel):
-        y = self.backbone(mel)
-        y = self.head(y)
-        return y
-
-    @torch.no_grad()
-    def encode(self, x):
-        return self.mel_transform(x)
-
-    def forward(self, mel):
-        y = self.backbone(mel)
-        y = self.head(y)
-        return y
--- a/comfy/ldm/audio/autoencoder.py
+++ b/comfy/ldm/audio/autoencoder.py
@@ -75,10 +75,16 @@ class SnakeBeta(nn.Module):
        return x

 def WNConv1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
+    except:
+        return torch.nn.utils.weight_norm(ops.Conv1d(*args, **kwargs)) #support pytorch 2.1 and older

 def WNConvTranspose1d(*args, **kwargs):
-    return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    try:
+        return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
+    except:
+        return torch.nn.utils.weight_norm(ops.ConvTranspose1d(*args, **kwargs)) #support pytorch 2.1 and older

 def get_activation(activation: Literal["elu", "snake", "none"], antialias=False, channels=None) -> nn.Module:
    if activation == "elu":
--- a/comfy/ldm/cascade/stage_a.py
+++ b/comfy/ldm/cascade/stage_a.py
@@ -19,10 +19,6 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-import comfy.ops
-
-ops = comfy.ops.disable_weight_init
-

 class vector_quantize(Function):
    @staticmethod
@@ -125,15 +121,15 @@ class ResBlock(nn.Module):
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
-            ops.Conv2d(c, c, kernel_size=3, groups=c)
+            nn.Conv2d(c, c, kernel_size=3, groups=c)
        )

        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
-            ops.Linear(c, c_hidden),
+            nn.Linear(c, c_hidden),
            nn.GELU(),
-            ops.Linear(c_hidden, c),
+            nn.Linear(c_hidden, c),
        )

        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@@ -175,16 +171,16 @@ class StageA(nn.Module):
        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
-            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
-                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
-            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)
@@ -195,7 +191,7 @@ class StageA(nn.Module):

        # Decoder blocks
        up_blocks = [nn.Sequential(
-            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for j in range(bottleneck_blocks if i == 0 else 1):
@@ -203,11 +199,11 @@ class StageA(nn.Module):
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
-                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
-            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )

@@ -236,17 +232,17 @@ class Discriminator(nn.Module):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
-            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = c_hidden // (2 ** max((d - i), 0))
            c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
-        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
        self.logits = nn.Sigmoid()

    def forward(self, x, cond=None):
--- a/comfy/ldm/cascade/stage_c_coder.py
+++ b/comfy/ldm/cascade/stage_c_coder.py
@@ -19,9 +19,6 @@ import torch
 import torchvision
 from torch import nn

-import comfy.ops
-
-ops = comfy.ops.disable_weight_init

 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@@ -29,7 +26,7 @@ class EfficientNetEncoder(nn.Module):
        super().__init__()
        self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
        self.mapper = nn.Sequential(
-            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
        )
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@@ -37,7 +34,7 @@ class EfficientNetEncoder(nn.Module):

    def forward(self, x):
        x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
+        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
        o = self.mapper(self.backbone(x))
        return o

@@ -47,39 +44,39 @@ class Previewer(nn.Module):
    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        self.blocks = nn.Sequential(
-            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
        )

    def forward(self, x):
--- a/comfy/ldm/chroma/layers.py
+++ b/comfy/ldm/chroma/layers.py
@@ -1,183 +0,0 @@
-import torch
-from torch import Tensor, nn
-
-from comfy.ldm.flux.math import attention
-from comfy.ldm.flux.layers import (
-    MLPEmbedder,
-    RMSNorm,
-    QKNorm,
-    SelfAttention,
-    ModulationOut,
-)
-
-
-
-class ChromaModulationOut(ModulationOut):
-    @classmethod
-    def from_offset(cls, tensor: torch.Tensor, offset: int = 0) -> ModulationOut:
-        return cls(
-            shift=tensor[:, offset : offset + 1, :],
-            scale=tensor[:, offset + 1 : offset + 2, :],
-            gate=tensor[:, offset + 2 : offset + 3, :],
-        )
-
-
-
-
-class Approximator(nn.Module):
-    def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
-        self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
-        self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)])
-        self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
-
-    @property
-    def device(self):
-        # Get the device of the module (assumes all parameters are on the same device)
-        return next(self.parameters()).device
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = self.in_proj(x)
-
-        for layer, norms in zip(self.layers, self.norms):
-            x = x + layer(norms(x))
-
-        x = self.out_proj(x)
-
-        return x
-
-
-class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
-        super().__init__()
-
-        mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        self.num_heads = num_heads
-        self.hidden_size = hidden_size
-        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-
-        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-        self.flipped_img_txt = flipped_img_txt
-
-    def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None):
-        (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
-
-        # prepare image for attention
-        img_modulated = self.img_norm1(img)
-        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
-        img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
-
-        # prepare txt for attention
-        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
-        txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
-
-        # run actual attention
-        attn = attention(torch.cat((txt_q, img_q), dim=2),
-                         torch.cat((txt_k, img_k), dim=2),
-                         torch.cat((txt_v, img_v), dim=2),
-                         pe=pe, mask=attn_mask)
-
-        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
-
-        # calculate the img bloks
-        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
-
-        # calculate the txt bloks
-        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
-        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
-
-        if txt.dtype == torch.float16:
-            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
-
-        return img, txt
-
-
-class SingleStreamBlock(nn.Module):
-    """
-    A DiT block with parallel linear layers as described in
-    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qk_scale: float = None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.hidden_dim = hidden_size
-        self.num_heads = num_heads
-        head_dim = hidden_size // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-
-        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
-        # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
-
-        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-
-        self.hidden_size = hidden_size
-        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-
-        self.mlp_act = nn.GELU(approximate="tanh")
-
-    def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
-        mod = vec
-        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
-        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
-
-        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k = self.norm(q, k, v)
-
-        # compute attention
-        attn = attention(q, k, v, pe=pe, mask=attn_mask)
-        # compute activation in mlp stream, cat again and run second linear layer
-        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += mod.gate * output
-        if x.dtype == torch.float16:
-            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
-        return x
-
-
-class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
-
-    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
-        shift, scale = vec
-        shift = shift.squeeze(1)
-        scale = scale.squeeze(1)
-        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
-        x = self.linear(x)
-        return x
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@@ -1,271 +0,0 @@
-#Original code can be found on: https://github.com/black-forest-labs/flux
-
-from dataclasses import dataclass
-
-import torch
-from torch import Tensor, nn
-from einops import rearrange, repeat
-import comfy.ldm.common_dit
-
-from comfy.ldm.flux.layers import (
-    EmbedND,
-    timestep_embedding,
-)
-
-from .layers import (
-    DoubleStreamBlock,
-    LastLayer,
-    SingleStreamBlock,
-    Approximator,
-    ChromaModulationOut,
-)
-
-
-@dataclass
-class ChromaParams:
-    in_channels: int
-    out_channels: int
-    context_in_dim: int
-    hidden_size: int
-    mlp_ratio: float
-    num_heads: int
-    depth: int
-    depth_single_blocks: int
-    axes_dim: list
-    theta: int
-    patch_size: int
-    qkv_bias: bool
-    in_dim: int
-    out_dim: int
-    hidden_dim: int
-    n_layers: int
-
-
-
-
-class Chroma(nn.Module):
-    """
-    Transformer model for flow matching on sequences.
-    """
-
-    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        self.dtype = dtype
-        params = ChromaParams(**kwargs)
-        self.params = params
-        self.patch_size = params.patch_size
-        self.in_channels = params.in_channels
-        self.out_channels = params.out_channels
-        if params.hidden_size % params.num_heads != 0:
-            raise ValueError(
-                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-            )
-        pe_dim = params.hidden_size // params.num_heads
-        if sum(params.axes_dim) != pe_dim:
-            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
-        self.hidden_size = params.hidden_size
-        self.num_heads = params.num_heads
-        self.in_dim = params.in_dim
-        self.out_dim = params.out_dim
-        self.hidden_dim = params.hidden_dim
-        self.n_layers = params.n_layers
-        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
-        # set as nn identity for now, will overwrite it later.
-        self.distilled_guidance_layer = Approximator(
-                    in_dim=self.in_dim,
-                    hidden_dim=self.hidden_dim,
-                    out_dim=self.out_dim,
-                    n_layers=self.n_layers,
-                    dtype=dtype, device=device, operations=operations
-                )
-
-
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=params.mlp_ratio,
-                    qkv_bias=params.qkv_bias,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(params.depth)
-            ]
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
-                for _ in range(params.depth_single_blocks)
-            ]
-        )
-
-        if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
-
-        self.skip_mmdit = []
-        self.skip_dit = []
-        self.lite = False
-
-    def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0):
-        # This function slices up the modulations tensor which has the following layout:
-        #   single     : num_single_blocks * 3 elements
-        #   double_img : num_double_blocks * 6 elements
-        #   double_txt : num_double_blocks * 6 elements
-        #   final      : 2 elements
-        if block_type == "final":
-            return (tensor[:, -2:-1, :], tensor[:, -1:, :])
-        single_block_count = self.params.depth_single_blocks
-        double_block_count = self.params.depth
-        offset = 3 * idx
-        if block_type == "single":
-            return ChromaModulationOut.from_offset(tensor, offset)
-        # Double block modulations are 6 elements so we double 3 * idx.
-        offset *= 2
-        if block_type in {"double_img", "double_txt"}:
-            # Advance past the single block modulations.
-            offset += 3 * single_block_count
-            if block_type == "double_txt":
-                # Advance past the double block img modulations.
-                offset += 6 * double_block_count
-            return (
-                ChromaModulationOut.from_offset(tensor, offset),
-                ChromaModulationOut.from_offset(tensor, offset + 3),
-            )
-        raise ValueError("Bad block_type")
-
-
-    def forward_orig(
-        self,
-        img: Tensor,
-        img_ids: Tensor,
-        txt: Tensor,
-        txt_ids: Tensor,
-        timesteps: Tensor,
-        guidance: Tensor = None,
-        control = None,
-        transformer_options={},
-        attn_mask: Tensor = None,
-    ) -> Tensor:
-        patches_replace = transformer_options.get("patches_replace", {})
-        if img.ndim != 3 or txt.ndim != 3:
-            raise ValueError("Input img and txt tensors must have 3 dimensions.")
-
-        # running on sequences img
-        img = self.img_in(img)
-
-        # distilled vector guidance
-        mod_index_length = 344
-        distill_timestep = timestep_embedding(timesteps.detach().clone(), 16).to(img.device, img.dtype)
-        # guidance = guidance *
-        distil_guidance = timestep_embedding(guidance.detach().clone(), 16).to(img.device, img.dtype)
-
-        # get all modulation index
-        modulation_index = timestep_embedding(torch.arange(mod_index_length), 32).to(img.device, img.dtype)
-        # we need to broadcast the modulation index here so each batch has all of the index
-        modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype)
-        # and we need to broadcast timestep and guidance along too
-        timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype)
-        # then and only then we could concatenate it together
-        input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype)
-
-        mod_vectors = self.distilled_guidance_layer(input_vec)
-
-        txt = self.txt_in(txt)
-
-        ids = torch.cat((txt_ids, img_ids), dim=1)
-        pe = self.pe_embedder(ids)
-
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if i not in self.skip_mmdit:
-                double_mod = (
-                    self.get_modulations(mod_vectors, "double_img", idx=i),
-                    self.get_modulations(mod_vectors, "double_txt", idx=i),
-                )
-                if ("double_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["img"], out["txt"] = block(img=args["img"],
-                                                       txt=args["txt"],
-                                                       vec=args["vec"],
-                                                       pe=args["pe"],
-                                                       attn_mask=args.get("attn_mask"))
-                        return out
-
-                    out = blocks_replace[("double_block", i)]({"img": img,
-                                                               "txt": txt,
-                                                               "vec": double_mod,
-                                                               "pe": pe,
-                                                               "attn_mask": attn_mask},
-                                                              {"original_block": block_wrap})
-                    txt = out["txt"]
-                    img = out["img"]
-                else:
-                    img, txt = block(img=img,
-                                     txt=txt,
-                                     vec=double_mod,
-                                     pe=pe,
-                                     attn_mask=attn_mask)
-
-                if control is not None: # Controlnet
-                    control_i = control.get("input")
-                    if i < len(control_i):
-                        add = control_i[i]
-                        if add is not None:
-                            img += add
-
-        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if i not in self.skip_dit:
-                single_mod = self.get_modulations(mod_vectors, "single", idx=i)
-                if ("single_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["img"] = block(args["img"],
-                                           vec=args["vec"],
-                                           pe=args["pe"],
-                                           attn_mask=args.get("attn_mask"))
-                        return out
-
-                    out = blocks_replace[("single_block", i)]({"img": img,
-                                                               "vec": single_mod,
-                                                               "pe": pe,
-                                                               "attn_mask": attn_mask},
-                                                              {"original_block": block_wrap})
-                    img = out["img"]
-                else:
-                    img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask)
-
-                if control is not None: # Controlnet
-                    control_o = control.get("output")
-                    if i < len(control_o):
-                        add = control_o[i]
-                        if add is not None:
-                            img[:, txt.shape[1] :, ...] += add
-
-        img = img[:, txt.shape[1] :, ...]
-        final_mod = self.get_modulations(mod_vectors, "final")
-        img = self.final_layer(img, vec=final_mod)  # (N, T, patch_size ** 2 * out_channels)
-        return img
-
-    def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
-        bs, c, h, w = x.shape
-        patch_size = 2
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
-
-        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-
-        h_len = ((h + (patch_size // 2)) // patch_size)
-        w_len = ((w + (patch_size // 2)) // patch_size)
-        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
-        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
-
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@@ -1,6 +1,5 @@
 import torch
-import comfy.rmsnorm
-
+import comfy.ops

 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
    if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
@@ -12,5 +11,20 @@ def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):

    return torch.nn.functional.pad(img, pad, mode=padding_mode)

+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+except:
+    rms_norm_torch = None

-rms_norm = comfy.rmsnorm.rms_norm
+def rms_norm(x, weight=None, eps=1e-6):
+    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
+        if weight is None:
+            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
+        else:
+            return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    else:
+        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+        if weight is None:
+            return r
+        else:
+            return r * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@@ -23,6 +23,7 @@ from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
 from torch import nn

+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
 from comfy.ldm.modules.attention import optimized_attention


@@ -36,11 +37,11 @@ def apply_rotary_pos_emb(
    return t_out


-def get_normalization(name: str, channels: int, weight_args={}, operations=None):
+def get_normalization(name: str, channels: int, weight_args={}):
    if name == "I":
        return nn.Identity()
    elif name == "R":
-        return operations.RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
+        return RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
    else:
        raise ValueError(f"Normalization {name} not found")

@@ -119,15 +120,15 @@ class Attention(nn.Module):

        self.to_q = nn.Sequential(
            operations.Linear(query_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[0], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[0], norm_dim),
        )
        self.to_k = nn.Sequential(
            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[1], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[1], norm_dim),
        )
        self.to_v = nn.Sequential(
            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[2], norm_dim, weight_args=weight_args, operations=operations),
+            get_normalization(qkv_norm[2], norm_dim),
        )

        self.to_out = nn.Sequential(
@@ -167,19 +168,15 @@ class Attention(nn.Module):
        k = self.to_k[1](k)
        v = self.to_v[1](v)
        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
-            # apply_rotary_pos_emb inlined
-            q_shape = q.shape
-            q = q.reshape(*q.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            q = rope_emb[..., 0] * q[..., 0] + rope_emb[..., 1] * q[..., 1]
-            q = q.movedim(-1, -2).reshape(*q_shape).to(x.dtype)
-
-            # apply_rotary_pos_emb inlined
-            k_shape = k.shape
-            k = k.reshape(*k.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            k = rope_emb[..., 0] * k[..., 0] + rope_emb[..., 1] * k[..., 1]
-            k = k.movedim(-1, -2).reshape(*k_shape).to(x.dtype)
+            q = apply_rotary_pos_emb(q, rope_emb)
+            k = apply_rotary_pos_emb(k, rope_emb)
        return q, k, v

+    def cal_attn(self, q, k, v, mask=None):
+        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
+        out = rearrange(out, " b n s c -> s b (n c)")
+        return self.to_out(out)
+
    def forward(
        self,
        x,
@@ -194,10 +191,7 @@ class Attention(nn.Module):
            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
        """
        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
-        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
-        del q, k, v
-        out = rearrange(out, " b n s c -> s b (n c)")
-        return self.to_out(out)
+        return self.cal_attn(q, k, v, mask)


 class FeedForward(nn.Module):
@@ -794,7 +788,10 @@ class GeneralDITTransformerBlock(nn.Module):
        crossattn_mask: Optional[torch.Tensor] = None,
        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
+        if extra_per_block_pos_emb is not None:
+            x = x + extra_per_block_pos_emb
        for block in self.blocks:
            x = block(
                x,
--- a/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
@@ -30,8 +30,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 import logging

-from comfy.ldm.modules.diffusionmodules.model import vae_attention
-
 from .patching import (
    Patcher,
    Patcher3D,
@@ -402,8 +400,6 @@ class CausalAttnBlock(nn.Module):
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

-        self.optimized_attention = vae_attention()
-
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h_ = x
        h_ = self.norm(h_)
@@ -417,7 +413,18 @@ class CausalAttnBlock(nn.Module):
        v, batch_size = time2batch(v)

        b, c, h, w = q.shape
-        h_ = self.optimized_attention(q, k, v)
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)
+        k = k.reshape(b, c, h * w)
+        w_ = torch.bmm(q, k)
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = F.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)
+        h_ = torch.bmm(v, w_)
+        h_ = h_.reshape(b, c, h, w)

        h_ = batch2time(h_, batch_size)
        h_ = self.proj_out(h_)
@@ -864,16 +871,18 @@ class EncoderFactorized(nn.Module):
        x = self.patcher3d(x)

        # downsampling
-        h = self.conv_in(x)
+        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](h)
+                h = self.down[i_level].block[i_block](hs[-1])
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
            if i_level != self.num_resolutions - 1:
-                h = self.down[i_level].downsample(h)
+                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
+        h = hs[-1]
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
--- a/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
@@ -281,76 +281,54 @@ class UnPatcher3D(UnPatcher):
        hh = hh.to(dtype=dtype)

        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
-        del x

        # Height height transposed convolutions.
        xll = F.conv_transpose3d(
            xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlll
-
        xll += F.conv_transpose3d(
            xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xllh

        xlh = F.conv_transpose3d(
            xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlhl
-
        xlh += F.conv_transpose3d(
            xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xlhh

        xhl = F.conv_transpose3d(
            xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhll
-
        xhl += F.conv_transpose3d(
            xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhlh

        xhh = F.conv_transpose3d(
            xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhhl
-
        xhh += F.conv_transpose3d(
            xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
        )
-        del xhhh

        # Handles width transposed convolutions.
        xl = F.conv_transpose3d(
            xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xll
-
        xl += F.conv_transpose3d(
            xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xlh
-
        xh = F.conv_transpose3d(
            xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xhl
-
        xh += F.conv_transpose3d(
            xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
        )
-        del xhh

        # Handles time axis transposed convolutions.
        x = F.conv_transpose3d(
            xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
        )
-        del xl
-
        x += F.conv_transpose3d(
            xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
        )
--- a/comfy/ldm/cosmos/model.py
+++ b/comfy/ldm/cosmos/model.py
@@ -27,6 +27,8 @@ from torchvision import transforms
 from enum import Enum
 import logging

+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
+
 from .blocks import (
    FinalLayer,
    GeneralDITTransformerBlock,
@@ -166,7 +168,7 @@ class GeneralDIT(nn.Module):
            operations=operations,
        )

-        self.build_pos_embed(device=device, dtype=dtype)
+        self.build_pos_embed(device=device)
        self.block_x_format = block_x_format
        self.use_adaln_lora = use_adaln_lora
        self.adaln_lora_dim = adaln_lora_dim
@@ -193,7 +195,7 @@ class GeneralDIT(nn.Module):

        if self.affline_emb_norm:
            logging.debug("Building affine embedding normalization layer")
-            self.affline_norm = operations.RMSNorm(model_channels, elementwise_affine=True, eps=1e-6, device=device, dtype=dtype)
+            self.affline_norm = RMSNorm(model_channels, elementwise_affine=True, eps=1e-6)
        else:
            self.affline_norm = nn.Identity()

@@ -208,7 +210,7 @@ class GeneralDIT(nn.Module):
            operations=operations,
        )

-    def build_pos_embed(self, device=None, dtype=None):
+    def build_pos_embed(self, device=None):
        if self.pos_emb_cls == "rope3d":
            cls_type = VideoRopePosition3DEmb
        else:
@@ -240,7 +242,6 @@ class GeneralDIT(nn.Module):
            kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
            kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
            kwargs["device"] = device
-            kwargs["dtype"] = dtype
            self.extra_pos_embedder = LearnablePosEmbAxis(
                **kwargs,
            )
@@ -291,7 +292,7 @@ class GeneralDIT(nn.Module):
        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)

        if self.extra_per_block_abs_pos_emb:
-            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
+            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device)
        else:
            extra_pos_emb = None

@@ -475,8 +476,6 @@ class GeneralDIT(nn.Module):
            inputs["original_shape"],
        )
        extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = inputs["extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D"].to(x.dtype)
-        del inputs
-
        if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
            assert (
                x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
@@ -487,8 +486,6 @@ class GeneralDIT(nn.Module):
                self.blocks["block0"].x_format == block.x_format
            ), f"First block has x_format {self.blocks[0].x_format}, got {block.x_format}"

-            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-                x += extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D
            x = block(
                x,
                affline_emb_B_D,
@@ -496,6 +493,7 @@ class GeneralDIT(nn.Module):
                crossattn_mask,
                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
                adaln_lora_B_3D=adaln_lora_B_3D,
+                extra_per_block_pos_emb=extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
            )

        x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@@ -41,12 +41,12 @@ def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0)


 class VideoPositionEmb(nn.Module):
-    def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+    def forward(self, x_B_T_H_W_C: torch.Tensor, fps: Optional[torch.Tensor] = None, device=None) -> torch.Tensor:
        """
        It delegates the embedding generation to generate_embeddings function.
        """
        B_T_H_W_C = x_B_T_H_W_C.shape
-        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype)
+        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device)  # only the shape of x is forwarded

        return embeddings

@@ -104,7 +104,6 @@ class VideoRopePosition3DEmb(VideoPositionEmb):
        w_ntk_factor: Optional[float] = None,
        t_ntk_factor: Optional[float] = None,
        device=None,
-        dtype=None,
    ):
        """
        Generate embeddings for the given input size.
@@ -174,7 +173,6 @@ class LearnablePosEmbAxis(VideoPositionEmb):
        len_w: int,
        len_t: int,
        device=None,
-        dtype=None,
        **kwargs,
    ):
        """
@@ -186,16 +184,17 @@ class LearnablePosEmbAxis(VideoPositionEmb):
        self.interpolation = interpolation
        assert self.interpolation in ["crop"], f"Unknown interpolation method {self.interpolation}"

-        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device, dtype=dtype))
-        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype))
-        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype))
+        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device))
+        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device))
+        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device))

-    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+
+    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None) -> torch.Tensor:
        B, T, H, W, _ = B_T_H_W_C
        if self.interpolation == "crop":
-            emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype)
-            emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype)
-            emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype)
+            emb_h_H = self.pos_emb_h[:H].to(device=device)
+            emb_w_W = self.pos_emb_w[:W].to(device=device)
+            emb_t_T = self.pos_emb_t[:T].to(device=device)
            emb = (
                repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W)
                + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W)
--- a/comfy/ldm/cosmos/vae.py
+++ b/comfy/ldm/cosmos/vae.py
@@ -18,7 +18,6 @@ import logging
 import torch
 from torch import nn
 from enum import Enum
-import math

 from .cosmos_tokenizer.layers3d import (
    EncoderFactorized,
@@ -90,8 +89,8 @@ class CausalContinuousVideoTokenizer(nn.Module):
        self.distribution = IdentityDistribution()  # ContinuousFormulation[formulation_name].value()

        num_parameters = sum(param.numel() for param in self.parameters())
-        logging.debug(f"model={self.name}, num_parameters={num_parameters:,}")
-        logging.debug(
+        logging.info(f"model={self.name}, num_parameters={num_parameters:,}")
+        logging.info(
            f"z_channels={z_channels}, latent_channels={self.latent_channels}."
        )

@@ -106,23 +105,17 @@ class CausalContinuousVideoTokenizer(nn.Module):
        z, posteriors = self.distribution(moments)
        latent_ch = z.shape[1]
        latent_t = z.shape[2]
-        in_dtype = z.dtype
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        dtype = z.dtype
+        mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device)
+        std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device)
        return ((z - mean) / std) * self.sigma_data

    def decode(self, z):
        in_dtype = z.dtype
        latent_ch = z.shape[1]
        latent_t = z.shape[2]
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)

        z = z / self.sigma_data
        z = z * std + mean
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -105,9 +105,7 @@ class Modulation(nn.Module):
        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)

    def forward(self, vec: Tensor) -> tuple:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
@@ -115,20 +113,6 @@ class Modulation(nn.Module):
        )


-def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
-    if modulation_dims is None:
-        if m_add is not None:
-            return tensor * m_mult + m_add
-        else:
-            return tensor * m_mult
-    else:
-        for d in modulation_dims:
-            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
-            if m_add is not None:
-                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
-        return tensor
-
-
 class DoubleStreamBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()
@@ -159,20 +143,20 @@ class DoubleStreamBlock(nn.Module):
        )
        self.flipped_img_txt = flipped_img_txt

-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
-        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
@@ -195,12 +179,12 @@ class DoubleStreamBlock(nn.Module):
            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]

        # calculate the img bloks
-        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
-        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # calculate the txt bloks
-        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
-        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)

        if txt.dtype == torch.float16:
            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@@ -244,9 +228,10 @@ class SingleStreamBlock(nn.Module):
        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)

-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None) -> Tensor:
        mod, _ = self.modulation(vec)
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)
@@ -255,7 +240,7 @@ class SingleStreamBlock(nn.Module):
        attn = attention(q, k, v, pe=pe, mask=attn_mask)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += apply_mod(output, mod.gate, None, modulation_dims)
+        x += mod.gate * output
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
        return x
@@ -268,11 +253,8 @@ class LastLayer(nn.Module):
        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))

-    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-
-        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
-        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = (half[:, None, :] for half in self.adaLN_modulation(vec).chunk(2, dim=1))  # split on dim 1, then insert a broadcast axis
+        x = shift + self.norm_final(x) * (scale + 1)  # scale-and-shift applied to the normalized input
        x = self.linear(x)
        return x
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -5,16 +5,8 @@ from torch import Tensor
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management

-
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
-    q_shape = q.shape
-    k_shape = k.shape
-
-    if pe is not None:
-        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
-        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
-        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
-        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
+    q, k = apply_rope(q, k, pe)

    heads = q.shape[1]
    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
@@ -23,7 +15,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:

 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
        device = torch.device("cpu")
    else:
        device = pos.device
@@ -37,8 +29,8 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:


 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
+    # Upcast both inputs with .float() and expose the last axis as (pairs, 1, 2).
+    xq_, xk_ = (t.float().reshape(*t.shape[:-1], -1, 1, 2) for t in (xq, xk))
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -109,17 +109,15 @@ class Flux(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
        if self.params.guidance_embed:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
        txt = self.txt_in(txt)

-        if img_ids is not None:
-            ids = torch.cat((txt_ids, img_ids), dim=1)
-            pe = self.pe_embedder(ids)
-        else:
-            pe = None
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)

        blocks_replace = patches_replace.get("dit", {})
        for i, block in enumerate(self.double_blocks):
@@ -188,7 +186,7 @@ class Flux(nn.Module):
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

-    def forward(self, x, timestep, context, y, guidance=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance, control=None, transformer_options={}, **kwargs):
        bs, c, h, w = x.shape
        patch_size = self.patch_size
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@@ -13,6 +13,7 @@ from comfy.ldm.modules.attention import optimized_attention
 from .layers import (
    FeedForward,
    PatchEmbed,
+    RMSNorm,
    TimestepEmbedder,
 )

@@ -89,10 +90,10 @@ class AsymmetricAttention(nn.Module):

        # Query and key normalization for stability.
        assert qk_norm
-        self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
-        self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-5, device=device, dtype=dtype)
+        self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
+        self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)

        # Output layers. y features go back down from dim_x -> dim_y.
        self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)
--- a/comfy/ldm/genmo/joint_model/layers.py
+++ b/comfy/ldm/genmo/joint_model/layers.py
@@ -151,3 +151,14 @@ class PatchEmbed(nn.Module):

        x = self.norm(x)
        return x
+
+
+class RMSNorm(torch.nn.Module):  # normalization layer with a weight parameter and no bias; the math lives in comfy.ldm.common_dit.rms_norm
+    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps  # passed unchanged to rms_norm() in forward()
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))  # torch.empty: values are not initialized here
+        self.register_parameter("bias", None)  # registers "bias" as an explicitly absent parameter
+
+    def forward(self, x):  # NOTE(review): needs comfy.ldm.common_dit imported in this module; the import is not visible in this hunk - confirm
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
--- a/comfy/ldm/hidream/model.py
+++ b/comfy/ldm/hidream/model.py
@@ -1,802 +0,0 @@
-from typing import Optional, Tuple, List
-
-import torch
-import torch.nn as nn
-import einops
-from einops import repeat
-
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-import torch.nn.functional as F
-
-from comfy.ldm.flux.math import apply_rope, rope
-from comfy.ldm.flux.layers import LastLayer
-
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.model_management
-import comfy.ldm.common_dit
-
-
-# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
-class EmbedND(nn.Module):
-    def __init__(self, theta: int, axes_dim: List[int]):
-        super().__init__()
-        self.theta = theta
-        self.axes_dim = axes_dim
-
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        n_axes = ids.shape[-1]
-        emb = torch.cat(
-            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
-            dim=-3,
-        )
-        return emb.unsqueeze(2)
-
-
-class PatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size=2,
-        in_channels=4,
-        out_channels=1024,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.patch_size = patch_size
-        self.out_channels = out_channels
-        self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
-
-    def forward(self, latent):
-        latent = self.proj(latent)
-        return latent
-
-
-class PooledEmbed(nn.Module):
-    def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, pooled_embed):
-        return self.pooled_embedder(pooled_embed)
-
-
-class TimestepEmbed(nn.Module):
-    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, timesteps, wdtype):
-        t_emb = self.time_proj(timesteps).to(dtype=wdtype)
-        t_emb = self.timestep_embedder(t_emb)
-        return t_emb
-
-
-def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
-    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
-
-
-class HiDreamAttnProcessor_flashattn:
-    """Attention processor used typically in processing the SD3-like self-attention projections."""
-
-    def __call__(
-        self,
-        attn,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-        *args,
-        **kwargs,
-    ) -> torch.FloatTensor:
-        dtype = image_tokens.dtype
-        batch_size = image_tokens.shape[0]
-
-        query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
-        key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
-        value_i = attn.to_v(image_tokens)
-
-        inner_dim = key_i.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
-        key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
-        value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
-        if image_tokens_masks is not None:
-            key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
-
-        if not attn.single:
-            query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
-            key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
-            value_t = attn.to_v_t(text_tokens)
-
-            query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
-            key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
-            value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
-
-            num_image_tokens = query_i.shape[1]
-            num_text_tokens = query_t.shape[1]
-            query = torch.cat([query_i, query_t], dim=1)
-            key = torch.cat([key_i, key_t], dim=1)
-            value = torch.cat([value_i, value_t], dim=1)
-        else:
-            query = query_i
-            key = key_i
-            value = value_i
-
-        if query.shape[-1] == rope.shape[-3] * 2:
-            query, key = apply_rope(query, key, rope)
-        else:
-            query_1, query_2 = query.chunk(2, dim=-1)
-            key_1, key_2 = key.chunk(2, dim=-1)
-            query_1, key_1 = apply_rope(query_1, key_1, rope)
-            query = torch.cat([query_1, query_2], dim=-1)
-            key = torch.cat([key_1, key_2], dim=-1)
-
-        hidden_states = attention(query, key, value)
-
-        if not attn.single:
-            hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
-            hidden_states_i = attn.to_out(hidden_states_i)
-            hidden_states_t = attn.to_out_t(hidden_states_t)
-            return hidden_states_i, hidden_states_t
-        else:
-            hidden_states = attn.to_out(hidden_states)
-            return hidden_states
-
-class HiDreamAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        upcast_attention: bool = False,
-        upcast_softmax: bool = False,
-        scale_qk: bool = True,
-        eps: float = 1e-5,
-        processor = None,
-        out_dim: int = None,
-        single: bool = False,
-        dtype=None, device=None, operations=None
-    ):
-        # super(Attention, self).__init__()
-        super().__init__()
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.query_dim = query_dim
-        self.upcast_attention = upcast_attention
-        self.upcast_softmax = upcast_softmax
-        self.out_dim = out_dim if out_dim is not None else query_dim
-
-        self.scale_qk = scale_qk
-        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        self.sliceable_head_dim = heads
-        self.single = single
-
-        linear_cls = operations.Linear
-        self.linear_cls = linear_cls
-        self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-        self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-        self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        if not single:
-            self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-            self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-            self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        self.processor = processor
-
-    def forward(
-        self,
-        norm_image_tokens: torch.FloatTensor,
-        image_tokens_masks: torch.FloatTensor = None,
-        norm_text_tokens: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.Tensor:
-        return self.processor(
-            self,
-            image_tokens = norm_image_tokens,
-            image_tokens_masks = image_tokens_masks,
-            text_tokens = norm_text_tokens,
-            rope = rope,
-        )
-
-
-class FeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * (
-            (hidden_dim + multiple_of - 1) // multiple_of
-        )
-
-        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
-        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MoEGate(nn.Module):
-    def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.top_k = num_activated_experts
-        self.n_routed_experts = num_routed_experts
-
-        self.scoring_func = 'softmax'
-        self.alpha = aux_loss_alpha
-        self.seq_aux = False
-
-        # topk selection algorithm
-        self.norm_topk_prob = False
-        self.gating_dim = embed_dim
-        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        pass
-        # import torch.nn.init  as init
-        # init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-
-    def forward(self, hidden_states):
-        bsz, seq_len, h = hidden_states.shape
-
-        ### compute gating score
-        hidden_states = hidden_states.view(-1, h)
-        logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
-        if self.scoring_func == 'softmax':
-            scores = logits.softmax(dim=-1)
-        else:
-            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
-
-        ### select top-k experts
-        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
-
-        ### norm gate to sum 1
-        if self.top_k > 1 and self.norm_topk_prob:
-            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
-            topk_weight = topk_weight / denominator
-
-        aux_loss = None
-        return topk_idx, topk_weight, aux_loss
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MOEFeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        num_routed_experts: int,
-        num_activated_experts: int,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
-        self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
-        self.gate = MoEGate(
-            embed_dim = dim,
-            num_routed_experts = num_routed_experts,
-            num_activated_experts = num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.num_activated_experts = num_activated_experts
-
-    def forward(self, x):
-        wtype = x.dtype
-        identity = x
-        orig_shape = x.shape
-        topk_idx, topk_weight, aux_loss = self.gate(x)
-        x = x.view(-1, x.shape[-1])
-        flat_topk_idx = topk_idx.view(-1)
-        if True:  # self.training: # TODO: check which branch performs faster
-            x = x.repeat_interleave(self.num_activated_experts, dim=0)
-            y = torch.empty_like(x, dtype=wtype)
-            for i, expert in enumerate(self.experts):
-                y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
-            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-            y =  y.view(*orig_shape).to(dtype=wtype)
-            #y = AddAuxiliaryLoss.apply(y, aux_loss)
-        else:
-            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
-        y = y + self.shared_experts(identity)
-        return y
-
-    @torch.no_grad()
-    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
-        expert_cache = torch.zeros_like(x)
-        idxs = flat_expert_indices.argsort()
-        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
-        token_idxs = idxs // self.num_activated_experts
-        for i, end_idx in enumerate(tokens_per_expert):
-            start_idx = 0 if i == 0 else tokens_per_expert[i-1]
-            if start_idx == end_idx:
-                continue
-            expert = self.experts[i]
-            exp_token_idx = token_idxs[start_idx:end_idx]
-            expert_tokens = x[exp_token_idx]
-            expert_out = expert(expert_tokens)
-            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
-
-            # for fp16 and other dtype
-            expert_cache = expert_cache.to(expert_out.dtype)
-            expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
-        return expert_cache
-
-
-class TextProjection(nn.Module):
-    def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
-
-    def forward(self, caption):
-        hidden_states = self.linear(caption)
-        return hidden_states
-
-
-class BlockType:
-    TransformerBlock = 1
-    SingleTransformerBlock = 2
-
-
-class HiDreamImageSingleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
-        )
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = True,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        attn_output_i = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            rope = rope,
-        )
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
-        image_tokens = ff_output_i + image_tokens
-        return image_tokens
-
-
-class HiDreamImageTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
-        )
-        # nn.init.zeros_(self.adaLN_modulation[1].weight)
-        # nn.init.zeros_(self.adaLN_modulation[1].bias)
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = False,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-        self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
-        self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
-        shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
-
-        attn_output_i, attn_output_t = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            norm_text_tokens,
-            rope = rope,
-        )
-
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-        text_tokens = gate_msa_t * attn_output_t + text_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
-
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
-        ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
-        image_tokens = ff_output_i + image_tokens
-        text_tokens = ff_output_t + text_tokens
-        return image_tokens, text_tokens
-
-
-class HiDreamImageBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        block_type: BlockType = BlockType.TransformerBlock,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        block_classes = {
-            BlockType.TransformerBlock: HiDreamImageTransformerBlock,
-            BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
-        }
-        self.block = block_classes[block_type](
-            dim,
-            num_attention_heads,
-            attention_head_dim,
-            num_routed_experts,
-            num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        return self.block(
-            image_tokens,
-            image_tokens_masks,
-            text_tokens,
-            adaln_input,
-            rope,
-        )
-
-
-class HiDreamImageTransformer2DModel(nn.Module):
-    def __init__(
-        self,
-        patch_size: Optional[int] = None,
-        in_channels: int = 64,
-        out_channels: Optional[int] = None,
-        num_layers: int = 16,
-        num_single_layers: int = 32,
-        attention_head_dim: int = 128,
-        num_attention_heads: int = 20,
-        caption_channels: List[int] = None,
-        text_emb_dim: int = 2048,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        axes_dims_rope: Tuple[int, int] = (32, 32),
-        max_resolution: Tuple[int, int] = (128, 128),
-        llama_layers: List[int] = None,
-        image_model=None,
-        dtype=None, device=None, operations=None
-    ):
-        self.patch_size = patch_size
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        self.num_layers = num_layers
-        self.num_single_layers = num_single_layers
-
-        self.gradient_checkpointing = False
-
-        super().__init__()
-        self.dtype = dtype
-        self.out_channels = out_channels or in_channels
-        self.inner_dim = self.num_attention_heads * self.attention_head_dim
-        self.llama_layers = llama_layers
-
-        self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.x_embedder = PatchEmbed(
-            patch_size = patch_size,
-            in_channels = in_channels,
-            out_channels = self.inner_dim,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
-
-        self.double_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.TransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_layers)
-            ]
-        )
-
-        self.single_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.SingleTransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_single_layers)
-            ]
-        )
-
-        self.final_layer = LastLayer(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
-
-        caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
-        caption_projection = []
-        for caption_channel in caption_channels:
-            caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
-        self.caption_projection = nn.ModuleList(caption_projection)
-        self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
-
-    def expand_timesteps(self, timesteps, batch_size, device):
-        if not torch.is_tensor(timesteps):
-            is_mps = device.type == "mps"
-            if isinstance(timesteps, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(device)
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(batch_size)
-        return timesteps
-
-    def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
-        x_arr = []
-        for i, img_size in enumerate(img_sizes):
-            pH, pW = img_size
-            x_arr.append(
-                einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
-                    p1=self.patch_size, p2=self.patch_size)
-            )
-        x = torch.cat(x_arr, dim=0)
-        return x
-
-    def patchify(self, x, max_seq, img_sizes=None):
-        pz2 = self.patch_size * self.patch_size
-        if isinstance(x, torch.Tensor):
-            B = x.shape[0]
-            device = x.device
-            dtype = x.dtype
-        else:
-            B = len(x)
-            device = x[0].device
-            dtype = x[0].dtype
-        x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
-
-        if img_sizes is not None:
-            for i, img_size in enumerate(img_sizes):
-                x_masks[i, 0:img_size[0] * img_size[1]] = 1
-            x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
-        elif isinstance(x, torch.Tensor):
-            pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
-            x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
-            img_sizes = [[pH, pW]] * B
-            x_masks = None
-        else:
-            raise NotImplementedError
-        return x, x_masks, img_sizes
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        t: torch.Tensor,
-        y: Optional[torch.Tensor] = None,
-        context: Optional[torch.Tensor] = None,
-        encoder_hidden_states_llama3=None,
-        image_cond=None,
-        control = None,
-        transformer_options = {},
-    ) -> torch.Tensor:
-        bs, c, h, w = x.shape
-        if image_cond is not None:
-            x = torch.cat([x, image_cond], dim=-1)
-        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-        timesteps = t
-        pooled_embeds = y
-        T5_encoder_hidden_states = context
-
-        img_sizes = None
-
-        # spatial forward
-        batch_size = hidden_states.shape[0]
-        hidden_states_type = hidden_states.dtype
-
-        # 0. time
-        timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
-        timesteps = self.t_embedder(timesteps, hidden_states_type)
-        p_embedder = self.p_embedder(pooled_embeds)
-        adaln_input = timesteps + p_embedder
-
-        hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
-        if image_tokens_masks is None:
-            pH, pW = img_sizes[0]
-            img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
-            img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
-            img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
-            img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
-        hidden_states = self.x_embedder(hidden_states)
-
-        # T5_encoder_hidden_states = encoder_hidden_states[0]
-        encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
-        encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
-
-        if self.caption_projection is not None:
-            new_encoder_hidden_states = []
-            for i, enc_hidden_state in enumerate(encoder_hidden_states):
-                enc_hidden_state = self.caption_projection[i](enc_hidden_state)
-                enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
-                new_encoder_hidden_states.append(enc_hidden_state)
-            encoder_hidden_states = new_encoder_hidden_states
-            T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
-            T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-            encoder_hidden_states.append(T5_encoder_hidden_states)
-
-        txt_ids = torch.zeros(
-            batch_size,
-            encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
-            3,
-            device=img_ids.device, dtype=img_ids.dtype
-        )
-        ids = torch.cat((img_ids, txt_ids), dim=1)
-        rope = self.pe_embedder(ids)
-
-        # 2. Blocks
-        block_id = 0
-        initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
-        initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
-        for bid, block in enumerate(self.double_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states, initial_encoder_hidden_states = block(
-                image_tokens = hidden_states,
-                image_tokens_masks = image_tokens_masks,
-                text_tokens = cur_encoder_hidden_states,
-                adaln_input = adaln_input,
-                rope = rope,
-            )
-            initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
-            block_id += 1
-
-        image_tokens_seq_len = hidden_states.shape[1]
-        hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
-        hidden_states_seq_len = hidden_states.shape[1]
-        if image_tokens_masks is not None:
-            encoder_attention_mask_ones = torch.ones(
-                (batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
-                device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
-            )
-            image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
-
-        for bid, block in enumerate(self.single_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states = block(
-                image_tokens=hidden_states,
-                image_tokens_masks=image_tokens_masks,
-                text_tokens=None,
-                adaln_input=adaln_input,
-                rope=rope,
-            )
-            hidden_states = hidden_states[:, :hidden_states_seq_len]
-            block_id += 1
-
-        hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
-        output = self.final_layer(hidden_states, adaln_input)
-        output = self.unpatchify(output, img_sizes)
-        return -output[:, :, :h, :w]
--- a/comfy/ldm/hunyuan3d/model.py
+++ b/comfy/ldm/hunyuan3d/model.py
@@ -1,135 +0,0 @@
-import torch
-from torch import nn
-from comfy.ldm.flux.layers import (
-    DoubleStreamBlock,
-    LastLayer,
-    MLPEmbedder,
-    SingleStreamBlock,
-    timestep_embedding,
-)
-
-
-class Hunyuan3Dv2(nn.Module):
-    def __init__(
-        self,
-        in_channels=64,
-        context_in_dim=1536,
-        hidden_size=1024,
-        mlp_ratio=4.0,
-        num_heads=16,
-        depth=16,
-        depth_single_blocks=32,
-        qkv_bias=True,
-        guidance_embed=False,
-        image_model=None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.dtype = dtype
-
-        if hidden_size % num_heads != 0:
-            raise ValueError(
-                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
-            )
-
-        self.max_period = 1000  # While reimplementing the model I noticed that they messed up. This 1000 value was meant to be the time_factor but they set the max_period instead
-        self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
-        )
-        self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth)
-            ]
-        )
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth_single_blocks)
-            ]
-        )
-        self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
-        x = x.movedim(-1, -2)
-        timestep = 1.0 - timestep
-        txt = context
-        img = self.latent_in(x)
-
-        vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
-        if self.guidance_in is not None:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))
-
-        txt = self.cond_in(txt)
-        pe = None
-        attn_mask = None
-
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"],
-                                                   txt=args["txt"],
-                                                   vec=args["vec"],
-                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": img,
-                                                           "txt": txt,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                txt = out["txt"]
-                img = out["img"]
-            else:
-                img, txt = block(img=img,
-                                 txt=txt,
-                                 vec=vec,
-                                 pe=pe,
-                                 attn_mask=attn_mask)
-
-        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if ("single_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"],
-                                       vec=args["vec"],
-                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("single_block", i)]({"img": img,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                img = out["img"]
-            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
-
-        img = img[:, txt.shape[1]:, ...]
-        img = self.final_layer(img, vec)
-        return img.movedim(-2, -1) * (-1.0)
--- a/comfy/ldm/hunyuan3d/vae.py
+++ b/comfy/ldm/hunyuan3d/vae.py
@@ -1,587 +0,0 @@
-# Original: https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/models/autoencoders/model.py
-# Since the header on their VAE source file was a bit confusing we asked for permission to use this code from tencent under the GPL license used in ComfyUI.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-from typing import Union, Tuple, List, Callable, Optional
-
-import numpy as np
-from einops import repeat, rearrange
-from tqdm import tqdm
-import logging
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-def generate_dense_grid_points(
-    bbox_min: np.ndarray,
-    bbox_max: np.ndarray,
-    octree_resolution: int,
-    indexing: str = "ij",
-):
-    length = bbox_max - bbox_min
-    num_cells = octree_resolution
-
-    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
-    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
-    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
-    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
-    xyz = np.stack((xs, ys, zs), axis=-1)
-    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
-
-    return xyz, grid_size, length
-
-
-class VanillaVolumeDecoder:
-    @torch.no_grad()
-    def __call__(
-        self,
-        latents: torch.FloatTensor,
-        geo_decoder: Callable,
-        bounds: Union[Tuple[float], List[float], float] = 1.01,
-        num_chunks: int = 10000,
-        octree_resolution: int = None,
-        enable_pbar: bool = True,
-        **kwargs,
-    ):
-        device = latents.device
-        dtype = latents.dtype
-        batch_size = latents.shape[0]
-
-        # 1. generate query points
-        if isinstance(bounds, float):
-            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
-
-        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
-        xyz_samples, grid_size, length = generate_dense_grid_points(
-            bbox_min=bbox_min,
-            bbox_max=bbox_max,
-            octree_resolution=octree_resolution,
-            indexing="ij"
-        )
-        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
-
-        # 2. latents to 3d volume
-        batch_logits = []
-        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding",
-                          disable=not enable_pbar):
-            chunk_queries = xyz_samples[start: start + num_chunks, :]
-            chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
-            logits = geo_decoder(queries=chunk_queries, latents=latents)
-            batch_logits.append(logits)
-
-        grid_logits = torch.cat(batch_logits, dim=1)
-        grid_logits = grid_logits.view((batch_size, *grid_size)).float()
-
-        return grid_logits
-
-
-class FourierEmbedder(nn.Module):
-    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
-    each feature dimension of `x[..., i]` into:
-        [
-            sin(x[..., i]),
-            sin(f_1*x[..., i]),
-            sin(f_2*x[..., i]),
-            ...
-            sin(f_N * x[..., i]),
-            cos(x[..., i]),
-            cos(f_1*x[..., i]),
-            cos(f_2*x[..., i]),
-            ...
-            cos(f_N * x[..., i]),
-            x[..., i]     # only present if include_input is True.
-        ], here f_i is the frequency.
-
-    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
-    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
-    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
-
-    Args:
-        num_freqs (int): the number of frequencies, default is 6;
-        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
-        input_dim (int): the input dimension, default is 3;
-        include_input (bool): include the input tensor or not, default is True.
-
-    Attributes:
-        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-                otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
-
-        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
-            otherwise, it is input_dim * num_freqs * 2.
-
-    """
-
-    def __init__(self,
-                 num_freqs: int = 6,
-                 logspace: bool = True,
-                 input_dim: int = 3,
-                 include_input: bool = True,
-                 include_pi: bool = True) -> None:
-
-        """The initialization"""
-
-        super().__init__()
-
-        if logspace:
-            frequencies = 2.0 ** torch.arange(
-                num_freqs,
-                dtype=torch.float32
-            )
-        else:
-            frequencies = torch.linspace(
-                1.0,
-                2.0 ** (num_freqs - 1),
-                num_freqs,
-                dtype=torch.float32
-            )
-
-        if include_pi:
-            frequencies *= torch.pi
-
-        self.register_buffer("frequencies", frequencies, persistent=False)
-        self.include_input = include_input
-        self.num_freqs = num_freqs
-
-        self.out_dim = self.get_dims(input_dim)
-
-    def get_dims(self, input_dim):
-        temp = 1 if self.include_input or self.num_freqs == 0 else 0
-        out_dim = input_dim * (self.num_freqs * 2 + temp)
-
-        return out_dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """ Forward process.
-
-        Args:
-            x: tensor of shape [..., dim]
-
-        Returns:
-            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
-                where temp is 1 if include_input is True and 0 otherwise.
-        """
-
-        if self.num_freqs > 0:
-            embed = (x[..., None].contiguous() * self.frequencies.to(device=x.device, dtype=x.dtype)).view(*x.shape[:-1], -1)
-            if self.include_input:
-                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
-            else:
-                return torch.cat((embed.sin(), embed.cos()), dim=-1)
-        else:
-            return x
-
-
-class CrossAttentionProcessor:
-    def __call__(self, attn, q, k, v):
-        out = F.scaled_dot_product_attention(q, k, v)
-        return out
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-        'survival rate' as the argument.
-
-        """
-        if self.drop_prob == 0. or not self.training:
-            return x
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-        if keep_prob > 0.0 and self.scale_by_keep:
-            random_tensor.div_(keep_prob)
-        return x * random_tensor
-
-    def extra_repr(self):
-        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
-
-
-class MLP(nn.Module):
-    def __init__(
-        self, *,
-        width: int,
-        expand_ratio: int = 4,
-        output_width: int = None,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.c_fc = ops.Linear(width, width * expand_ratio)
-        self.c_proj = ops.Linear(width * expand_ratio, output_width if output_width is not None else width)
-        self.gelu = nn.GELU()
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
-
-
-class QKVMultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-        self.attn_processor = CrossAttentionProcessor()
-
-    def forward(self, q, kv):
-        _, n_ctx, _ = q.shape
-        bs, n_data, width = kv.shape
-        attn_ch = width // self.heads // 2
-        q = q.view(bs, n_ctx, self.heads, -1)
-        kv = kv.view(bs, n_data, self.heads, -1)
-        k, v = torch.split(kv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = self.attn_processor(self, q, k, v)
-        out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        data_width: Optional[int] = None,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        kv_cache: bool = False,
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.data_width = width if data_width is None else data_width
-        self.c_q = ops.Linear(width, width, bias=qkv_bias)
-        self.c_kv = ops.Linear(self.data_width, width * 2, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadCrossAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.kv_cache = kv_cache
-        self.data = None
-
-    def forward(self, x, data):
-        x = self.c_q(x)
-        if self.kv_cache:
-            if self.data is None:
-                self.data = self.c_kv(data)
-                logging.info('Save kv cache,this should be called only once for one mesh')
-            data = self.data
-        else:
-            data = self.c_kv(data)
-        x = self.attention(x, data)
-        x = self.c_proj(x)
-        return x
-
-
-class ResidualCrossAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        data_width: Optional[int] = None,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-
-        if data_width is None:
-            data_width = width
-
-        self.attn = MultiheadCrossAttention(
-            width=width,
-            heads=heads,
-            data_width=data_width,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
-        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio)
-
-    def forward(self, x: torch.Tensor, data: torch.Tensor):
-        x = x + self.attn(self.ln_1(x), self.ln_2(data))
-        x = x + self.mlp(self.ln_3(x))
-        return x
-
-
-class QKVMultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, qkv):
-        bs, n_ctx, width = qkv.shape
-        attn_ch = width // self.heads // 3
-        qkv = qkv.view(bs, n_ctx, self.heads, -1)
-        q, k, v = torch.split(qkv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        x = self.c_qkv(x)
-        x = self.attention(x)
-        x = self.drop_path(self.c_proj(x))
-        return x
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.attn = MultiheadAttention(
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
-        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        layers: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(
-                    width=width,
-                    heads=heads,
-                    qkv_bias=qkv_bias,
-                    norm_layer=norm_layer,
-                    qk_norm=qk_norm,
-                    drop_path_rate=drop_path_rate
-                )
-                for _ in range(layers)
-            ]
-        )
-
-    def forward(self, x: torch.Tensor):
-        for block in self.resblocks:
-            x = block(x)
-        return x
-
-
-class CrossAttentionDecoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        out_channels: int,
-        fourier_embedder: FourierEmbedder,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        downsample_ratio: int = 1,
-        enable_ln_post: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary"
-    ):
-        super().__init__()
-
-        self.enable_ln_post = enable_ln_post
-        self.fourier_embedder = fourier_embedder
-        self.downsample_ratio = downsample_ratio
-        self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
-        if self.downsample_ratio != 1:
-            self.latents_proj = ops.Linear(width * downsample_ratio, width)
-        if self.enable_ln_post == False:
-            qk_norm = False
-        self.cross_attn_decoder = ResidualCrossAttentionBlock(
-            width=width,
-            mlp_expand_ratio=mlp_expand_ratio,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm
-        )
-
-        if self.enable_ln_post:
-            self.ln_post = ops.LayerNorm(width)
-        self.output_proj = ops.Linear(width, out_channels)
-        self.label_type = label_type
-        self.count = 0
-
-    def forward(self, queries=None, query_embeddings=None, latents=None):
-        if query_embeddings is None:
-            query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
-        self.count += query_embeddings.shape[1]
-        if self.downsample_ratio != 1:
-            latents = self.latents_proj(latents)
-        x = self.cross_attn_decoder(query_embeddings, latents)
-        if self.enable_ln_post:
-            x = self.ln_post(x)
-        occ = self.output_proj(x)
-        return occ
-
-
-class ShapeVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        embed_dim: int,
-        width: int,
-        heads: int,
-        num_decoder_layers: int,
-        geo_decoder_downsample_ratio: int = 1,
-        geo_decoder_mlp_expand_ratio: int = 4,
-        geo_decoder_ln_post: bool = True,
-        num_freqs: int = 8,
-        include_pi: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary",
-        drop_path_rate: float = 0.0,
-        scale_factor: float = 1.0,
-    ):
-        super().__init__()
-        self.geo_decoder_ln_post = geo_decoder_ln_post
-
-        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
-
-        self.post_kl = ops.Linear(embed_dim, width)
-
-        self.transformer = Transformer(
-            width=width,
-            layers=num_decoder_layers,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-
-        self.geo_decoder = CrossAttentionDecoder(
-            fourier_embedder=self.fourier_embedder,
-            out_channels=1,
-            mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
-            downsample_ratio=geo_decoder_downsample_ratio,
-            enable_ln_post=self.geo_decoder_ln_post,
-            width=width // geo_decoder_downsample_ratio,
-            heads=heads // geo_decoder_downsample_ratio,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            label_type=label_type,
-        )
-
-        self.volume_decoder = VanillaVolumeDecoder()
-        self.scale_factor = scale_factor
-
-    def decode(self, latents, **kwargs):
-        latents = self.post_kl(latents.movedim(-2, -1))
-        latents = self.transformer(latents)
-
-        bounds = kwargs.get("bounds", 1.01)
-        num_chunks = kwargs.get("num_chunks", 8000)
-        octree_resolution = kwargs.get("octree_resolution", 256)
-        enable_pbar = kwargs.get("enable_pbar", True)
-
-        grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
-        return grid_logits.movedim(-2, -1)
-
-    def encode(self, x):
-        return None
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -227,7 +227,6 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
-        guiding_frame_index=None,
        control=None,
        transformer_options={},
    ) -> Tensor:
@@ -238,21 +237,12 @@ class HunyuanVideo(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))

-        if guiding_frame_index is not None:
-            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
-            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
-            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
-            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
-            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
-            modulation_dims_txt = [(0, None, 1)]
-        else:
-            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
-            modulation_dims = None
-            modulation_dims_txt = None
+        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])

        if self.params.guidance_embed:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

        if txt_mask is not None and not torch.is_floating_point(txt_mask):
            txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max
@@ -275,14 +265,14 @@ class HunyuanVideo(nn.Module):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out

-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_i = control.get("input")
@@ -297,13 +287,13 @@ class HunyuanVideo(nn.Module):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out

-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_o = control.get("output")
@@ -314,17 +304,17 @@ class HunyuanVideo(nn.Module):

        img = img[:, : img_len]

-        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)

        shape = initial_shape[-3:]
        for i in range(len(shape)):
            shape[i] = shape[i] // self.patch_size[i]
        img = img.reshape([img.shape[0]] + shape + [self.out_channels] + self.patch_size)
        img = img.permute(0, 4, 1, 5, 2, 6, 3, 7)
-        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
+        img = img.reshape(initial_shape)
        return img

-    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance, attention_mask=None, control=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
@@ -336,5 +326,5 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, control, transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, control, transformer_options)
        return out
--- a/comfy/ldm/hydit/models.py
+++ b/comfy/ldm/hydit/models.py
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn

 import comfy.ops
-from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed
+from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
 from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
 from torch.utils import checkpoint

@@ -51,7 +51,7 @@ class HunYuanDiTBlock(nn.Module):
        if norm_type == "layer":
            norm_layer = operations.LayerNorm
        elif norm_type == "rms":
-            norm_layer = operations.RMSNorm
+            norm_layer = RMSNorm
        else:
            raise ValueError(f"Unknown norm_type: {norm_type}")

--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@@ -1,12 +1,13 @@
 import torch
 from torch import nn
 import comfy.ldm.modules.attention
+from comfy.ldm.genmo.joint_model.layers import RMSNorm
 import comfy.ldm.common_dit
 from einops import rearrange
 import math
 from typing import Dict, Optional, Tuple

-from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
+from .symmetric_patchifier import SymmetricPatchifier


 def get_timestep_embedding(
@@ -261,8 +262,8 @@ class CrossAttention(nn.Module):
        self.heads = heads
        self.dim_head = dim_head

-        self.q_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device)
-        self.k_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.q_norm = RMSNorm(inner_dim, dtype=dtype, device=device)
+        self.k_norm = RMSNorm(inner_dim, dtype=dtype, device=device)

        self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device)
        self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device)
@@ -376,16 +377,12 @@ class LTXVModel(torch.nn.Module):

                 positional_embedding_theta=10000.0,
                 positional_embedding_max_pos=[20, 2048, 2048],
-                 causal_temporal_positioning=False,
-                 vae_scale_factors=(8, 32, 32),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.generator = None
-        self.vae_scale_factors = vae_scale_factors
        self.dtype = dtype
        self.out_channels = in_channels
        self.inner_dim = num_attention_heads * attention_head_dim
-        self.causal_temporal_positioning = causal_temporal_positioning

        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)

@@ -419,23 +416,42 @@ class LTXVModel(torch.nn.Module):

        self.patchifier = SymmetricPatchifier(1)

-    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
        patches_replace = transformer_options.get("patches_replace", {})

+        indices_grid = self.patchifier.get_grid(
+            orig_num_frames=x.shape[2],
+            orig_height=x.shape[3],
+            orig_width=x.shape[4],
+            batch_size=x.shape[0],
+            scale_grid=((1 / frame_rate) * 8, 32, 32),
+            device=x.device,
+        )
+
+        if guiding_latent is not None:
+            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
+            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
+            ts *= input_ts
+            ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
+            timestep = self.patchifier.patchify(ts)
+            input_x = x.clone()
+            x[:, :, 0] = guiding_latent[:, :, 0]
+            if guiding_latent_noise_scale > 0:
+                if self.generator is None:
+                    self.generator = torch.Generator(device=x.device).manual_seed(42)
+                elif self.generator.device != x.device:
+                    self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
+
+                noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
+                scale = guiding_latent_noise_scale * (input_ts ** 2)
+                guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
+
+                x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] *  (1.0 - scale[:, :, 0])
+
+
        orig_shape = list(x.shape)

-        x, latent_coords = self.patchifier.patchify(x)
-        pixel_coords = latent_to_pixel_coords(
-            latent_coords=latent_coords,
-            scale_factors=self.vae_scale_factors,
-            causal_fix=self.causal_temporal_positioning,
-        )
-
-        if keyframe_idxs is not None:
-            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
-
-        fractional_coords = pixel_coords.to(torch.float32)
-        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+        x = self.patchifier.patchify(x)

        x = self.patchify_proj(x)
        timestep = timestep * 1000.0
@@ -443,7 +459,7 @@ class LTXVModel(torch.nn.Module):
        if attention_mask is not None and not torch.is_floating_point(attention_mask):
            attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max

-        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
+        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)

        batch_size = x.shape[0]
        timestep, embedded_timestep = self.adaln_single(
@@ -503,4 +519,8 @@ class LTXVModel(torch.nn.Module):
            out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
        )

+        if guiding_latent is not None:
+            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
+
+        # print("res", x)
        return x
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@@ -6,29 +6,16 @@ from einops import rearrange
 from torch import Tensor


-def latent_to_pixel_coords(
-    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
-) -> Tensor:
-    """
-    Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
-    configuration.
-    Args:
-        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
-        containing the latent corner coordinates of each token.
-        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
-        causal_fix (bool): Whether to take into account the different temporal scale
-            of the first frame. Default = False for backwards compatibility.
-    Returns:
-        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
-    """
-    pixel_coords = (
-        latent_coords
-        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
-    )
-    if causal_fix:
-        # Fix temporal scale for first frame to 1 due to causality
-        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
-    return pixel_coords
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim  # number of trailing axes still missing
+    if dims_to_append < 0:  # x already has more dims than requested; cannot shrink
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    elif dims_to_append == 0:  # nothing to add: return the input tensor unchanged
+        return x
+    return x[(...,) + (None,) * dims_to_append]  # each None index adds one trailing size-1 axis


 class Patchifier(ABC):
@@ -57,26 +44,29 @@ class Patchifier(ABC):
    def patch_size(self):
        return self._patch_size

-    def get_latent_coords(
-        self, latent_num_frames, latent_height, latent_width, batch_size, device
+    def get_grid(  # per-patch (frame, height, width) index grid, returned as [batch_size, 3, f*h*w]
+        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
    ):
-        """
-        Return a tensor of shape [batch_size, 3, num_patches] containing the
-            top-left corner latent coordinates of each latent patch.
-        The tensor is repeated for each batch element.
-        """
-        latent_sample_coords = torch.meshgrid(
-            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
-            torch.arange(0, latent_height, self._patch_size[1], device=device),
-            torch.arange(0, latent_width, self._patch_size[2], device=device),
-            indexing="ij",
-        )
-        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
-        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-        latent_coords = rearrange(
-            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
-        )
-        return latent_coords
+        f = orig_num_frames // self._patch_size[0]  # patch count per axis (floor division drops any remainder)
+        h = orig_height // self._patch_size[1]
+        w = orig_width // self._patch_size[2]
+        grid_h = torch.arange(h, dtype=torch.float32, device=device)
+        grid_w = torch.arange(w, dtype=torch.float32, device=device)
+        grid_f = torch.arange(f, dtype=torch.float32, device=device)
+        grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')  # three [f, h, w] index tensors
+        grid = torch.stack(grid, dim=0)  # -> [3, f, h, w]; channel order is (frame, height, width)
+        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)  # -> [batch_size, 3, f, h, w]
+
+        if scale_grid is not None:
+            for i in range(3):
+                if isinstance(scale_grid[i], Tensor):
+                    scale = append_dims(scale_grid[i], grid.ndim - 1)  # pad to 4 dims so it broadcasts over grid[:, i]; presumably a per-sample scale - confirm with callers
+                else:
+                    scale = scale_grid[i]
+                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]  # patch index -> scaled coordinate on axis i
+
+        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)  # flatten (f, h, w) into a single token axis
+        return grid


 class SymmetricPatchifier(Patchifier):
@@ -84,8 +74,6 @@ class SymmetricPatchifier(Patchifier):
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
-        b, _, f, h, w = latents.shape
-        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -93,7 +81,7 @@ class SymmetricPatchifier(Patchifier):
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
-        return latents, latent_coords
+        return latents

    def unpatchify(
        self,
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@@ -15,7 +15,6 @@ class CausalConv3d(nn.Module):
        stride: Union[int, Tuple[int]] = 1,
        dilation: int = 1,
        groups: int = 1,
-        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()
@@ -39,7 +38,7 @@ class CausalConv3d(nn.Module):
            stride=stride,
            dilation=dilation,
            padding=padding,
-            padding_mode=spatial_padding_mode,
+            padding_mode="zeros",
            groups=groups,
        )

--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -1,15 +1,13 @@
-from __future__ import annotations
 import torch
 from torch import nn
 from functools import partial
 import math
 from einops import rearrange
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
-
 ops = comfy.ops.disable_weight_init

 class Encoder(nn.Module):
@@ -34,7 +32,7 @@ class Encoder(nn.Module):
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
        latent_log_var (`str`, *optional*, defaults to `per_channel`):
-            The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
    """

    def __init__(
@@ -42,13 +40,12 @@ class Encoder(nn.Module):
        dims: Union[int, Tuple[int, int]] = 3,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        norm_num_groups: int = 32,
        patch_size: Union[int, Tuple[int]] = 1,
        norm_layer: str = "group_norm",  # group_norm, pixel_norm
        latent_log_var: str = "per_channel",
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -68,7 +65,6 @@ class Encoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        self.down_blocks = nn.ModuleList([])
@@ -86,7 +82,6 @@ class Encoder(nn.Module):
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -97,7 +92,6 @@ class Encoder(nn.Module):
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = make_conv_nd(
@@ -107,7 +101,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 1, 1),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = make_conv_nd(
@@ -117,7 +110,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(1, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
                block = make_conv_nd(
@@ -127,7 +119,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -138,34 +129,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_all_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(2, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_space_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(1, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
-                )
-            elif block_name == "compress_time_res":
-                output_channel = block_params.get("multiplier", 2) * output_channel
-                block = SpaceToDepthDownsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    stride=(2, 1, 1),
-                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown block: {block_name}")
@@ -189,18 +152,10 @@ class Encoder(nn.Module):
            conv_out_channels *= 2
        elif latent_log_var == "uniform":
            conv_out_channels += 1
-        elif latent_log_var == "constant":
-            conv_out_channels += 1
        elif latent_log_var != "none":
            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
        self.conv_out = make_conv_nd(
-            dims,
-            output_channel,
-            conv_out_channels,
-            3,
-            padding=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
+            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False
@@ -242,15 +197,6 @@ class Encoder(nn.Module):
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            else:
                raise ValueError(f"Invalid input shape: {sample.shape}")
-        elif self.latent_log_var == "constant":
-            sample = sample[:, :-1, ...]
-            approx_ln_0 = (
-                -30
-            )  # this is the minimal clamp value in DiagonalGaussianDistribution objects
-            sample = torch.cat(
-                [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
-                dim=1,
-            )

        return sample

@@ -285,7 +231,7 @@ class Decoder(nn.Module):
        dims,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
@@ -293,7 +239,6 @@ class Decoder(nn.Module):
        norm_layer: str = "group_norm",
        causal: bool = True,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -319,7 +264,6 @@ class Decoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        self.up_blocks = nn.ModuleList([])
@@ -339,7 +283,6 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "attn_res_x":
                block = UNetMidBlock3D(
@@ -351,7 +294,6 @@ class Decoder(nn.Module):
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
                    attention_head_dim=block_params["attention_head_dim"],
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
@@ -364,21 +306,14 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=False,
-                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = DepthToSpaceUpsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    stride=(2, 1, 1),
-                    spatial_padding_mode=spatial_padding_mode,
+                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
                )
            elif block_name == "compress_space":
                block = DepthToSpaceUpsample(
-                    dims=dims,
-                    in_channels=input_channel,
-                    stride=(1, 2, 2),
-                    spatial_padding_mode=spatial_padding_mode,
+                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
                )
            elif block_name == "compress_all":
                output_channel = output_channel // block_params.get("multiplier", 1)
@@ -388,7 +323,6 @@ class Decoder(nn.Module):
                    stride=(2, 2, 2),
                    residual=block_params.get("residual", False),
                    out_channels_reduction_factor=block_params.get("multiplier", 1),
-                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown layer: {block_name}")
@@ -406,13 +340,7 @@ class Decoder(nn.Module):

        self.conv_act = nn.SiLU()
        self.conv_out = make_conv_nd(
-            dims,
-            output_channel,
-            out_channels,
-            3,
-            padding=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
+            dims, output_channel, out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False
@@ -505,12 +433,6 @@ class UNetMidBlock3D(nn.Module):
        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks.
-        norm_layer (`str`, *optional*, defaults to `group_norm`):
-            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
-        inject_noise (`bool`, *optional*, defaults to `False`):
-            Whether to inject noise into the hidden states.
-        timestep_conditioning (`bool`, *optional*, defaults to `False`):
-            Whether to condition the hidden states on the timestep.

    Returns:
        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -529,7 +451,6 @@ class UNetMidBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        resnet_groups = (
@@ -555,17 +476,13 @@ class UNetMidBlock3D(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=inject_noise,
                    timestep_conditioning=timestep_conditioning,
-                    spatial_padding_mode=spatial_padding_mode,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        causal: bool = True,
-        timestep: Optional[torch.Tensor] = None,
+        self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
    ) -> torch.FloatTensor:
        timestep_embed = None
        if self.timestep_conditioning:
@@ -590,62 +507,9 @@ class UNetMidBlock3D(nn.Module):
        return hidden_states


-class SpaceToDepthDownsample(nn.Module):
-    def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
-        super().__init__()
-        self.stride = stride
-        self.group_size = in_channels * math.prod(stride) // out_channels
-        self.conv = make_conv_nd(
-            dims=dims,
-            in_channels=in_channels,
-            out_channels=out_channels // math.prod(stride),
-            kernel_size=3,
-            stride=1,
-            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
-        )
-
-    def forward(self, x, causal: bool = True):
-        if self.stride[0] == 2:
-            x = torch.cat(
-                [x[:, :, :1, :, :], x], dim=2
-            )  # duplicate first frames for padding
-
-        # skip connection
-        x_in = rearrange(
-            x,
-            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-        x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
-        x_in = x_in.mean(dim=2)
-
-        # conv
-        x = self.conv(x, causal=causal)
-        x = rearrange(
-            x,
-            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-
-        x = x + x_in
-
-        return x
-
-
 class DepthToSpaceUpsample(nn.Module):
    def __init__(
-        self,
-        dims,
-        in_channels,
-        stride,
-        residual=False,
-        out_channels_reduction_factor=1,
-        spatial_padding_mode="zeros",
+        self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
    ):
        super().__init__()
        self.stride = stride
@@ -659,7 +523,6 @@ class DepthToSpaceUpsample(nn.Module):
            kernel_size=3,
            stride=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor
@@ -695,7 +558,7 @@ class DepthToSpaceUpsample(nn.Module):
 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
-        self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(self, x):
        x = rearrange(x, "b c d h w -> b d h w c")
@@ -728,7 +591,6 @@ class ResnetBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
-        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.in_channels = in_channels
@@ -755,7 +617,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
@@ -780,7 +641,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
-            spatial_padding_mode=spatial_padding_mode,
        )

        if inject_noise:
@@ -941,44 +801,9 @@ class processor(nn.Module):
        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)

 class VideoVAE(nn.Module):
-    def __init__(self, version=0, config=None):
+    def __init__(self, version=0):
        super().__init__()

-        if config is None:
-            config = self.guess_config(version)
-
-        self.timestep_conditioning = config.get("timestep_conditioning", False)
-        double_z = config.get("double_z", True)
-        latent_log_var = config.get(
-            "latent_log_var", "per_channel" if double_z else "none"
-        )
-
-        self.encoder = Encoder(
-            dims=config["dims"],
-            in_channels=config.get("in_channels", 3),
-            out_channels=config["latent_channels"],
-            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
-            patch_size=config.get("patch_size", 1),
-            latent_log_var=latent_log_var,
-            norm_layer=config.get("norm_layer", "group_norm"),
-            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
-        )
-
-        self.decoder = Decoder(
-            dims=config["dims"],
-            in_channels=config["latent_channels"],
-            out_channels=config.get("out_channels", 3),
-            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
-            patch_size=config.get("patch_size", 1),
-            norm_layer=config.get("norm_layer", "group_norm"),
-            causal=config.get("causal_decoder", False),
-            timestep_conditioning=self.timestep_conditioning,
-            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
-        )
-
-        self.per_channel_statistics = processor()
-
-    def guess_config(self, version):
        if version == 0:
            config = {
                "_class_name": "CausalVideoAutoencoder",
@@ -1005,7 +830,7 @@ class VideoVAE(nn.Module):
                "use_quant_conv": False,
                "causal_decoder": False,
            }
-        elif version == 1:
+        else:
            config = {
                "_class_name": "CausalVideoAutoencoder",
                "dims": 3,
@@ -1041,47 +866,37 @@ class VideoVAE(nn.Module):
                "causal_decoder": False,
                "timestep_conditioning": True,
            }
-        else:
-            config = {
-                "_class_name": "CausalVideoAutoencoder",
-                "dims": 3,
-                "in_channels": 3,
-                "out_channels": 3,
-                "latent_channels": 128,
-                "encoder_blocks": [
-                    ["res_x", {"num_layers": 4}],
-                    ["compress_space_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 6}],
-                    ["compress_time_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 6}],
-                    ["compress_all_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 2}],
-                    ["compress_all_res", {"multiplier": 2}],
-                    ["res_x", {"num_layers": 2}]
-                ],
-                "decoder_blocks": [
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
-                    ["compress_all", {"residual": True, "multiplier": 2}],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}]
-                ],
-                "scaling_factor": 1.0,
-                "norm_layer": "pixel_norm",
-                "patch_size": 4,
-                "latent_log_var": "uniform",
-                "use_quant_conv": False,
-                "causal_decoder": False,
-                "timestep_conditioning": True
-            }
-        return config
+
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+
+        self.encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+        )
+
+        self.decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            causal=config.get("causal_decoder", False),
+            timestep_conditioning=config.get("timestep_conditioning", False),
+        )
+
+        self.timestep_conditioning = config.get("timestep_conditioning", False)
+        self.per_channel_statistics = processor()

    def encode(self, x):
-        frames_count = x.shape[2]
-        if ((frames_count - 1) % 8) != 0:
-            raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
        return self.per_channel_statistics.normalize(means)

--- a/comfy/ldm/lightricks/vae/conv_nd_factory.py
+++ b/comfy/ldm/lightricks/vae/conv_nd_factory.py
@@ -17,11 +17,7 @@ def make_conv_nd(
    groups=1,
    bias=True,
    causal=False,
-    spatial_padding_mode="zeros",
-    temporal_padding_mode="zeros",
 ):
-    if not (spatial_padding_mode == temporal_padding_mode or causal):
-        raise NotImplementedError("spatial and temporal padding modes must be equal")
    if dims == 2:
        return ops.Conv2d(
            in_channels=in_channels,
@@ -32,7 +28,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    elif dims == 3:
        if causal:
@@ -45,7 +40,6 @@ def make_conv_nd(
                dilation=dilation,
                groups=groups,
                bias=bias,
-                spatial_padding_mode=spatial_padding_mode,
            )
        return ops.Conv3d(
            in_channels=in_channels,
@@ -56,7 +50,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    elif dims == (2, 1):
        return DualConv3d(
@@ -66,7 +59,6 @@ def make_conv_nd(
            stride=stride,
            padding=padding,
            bias=bias,
-            padding_mode=spatial_padding_mode,
        )
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
--- a/comfy/ldm/lightricks/vae/dual_conv3d.py
+++ b/comfy/ldm/lightricks/vae/dual_conv3d.py
@@ -18,13 +18,11 @@ class DualConv3d(nn.Module):
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups=1,
        bias=True,
-        padding_mode="zeros",
    ):
        super(DualConv3d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
-        self.padding_mode = padding_mode
        # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size, kernel_size)
@@ -110,7 +108,6 @@ class DualConv3d(nn.Module):
            self.padding1,
            self.dilation1,
            self.groups,
-            padding_mode=self.padding_mode,
        )

        if skip_time_conv:
@@ -125,7 +122,6 @@ class DualConv3d(nn.Module):
            self.padding2,
            self.dilation2,
            self.groups,
-            padding_mode=self.padding_mode,
        )

        return x
@@ -141,16 +137,7 @@ class DualConv3d(nn.Module):
        stride1 = (self.stride1[1], self.stride1[2])
        padding1 = (self.padding1[1], self.padding1[2])
        dilation1 = (self.dilation1[1], self.dilation1[2])
-        x = F.conv2d(
-            x,
-            weight1,
-            self.bias1,
-            stride1,
-            padding1,
-            dilation1,
-            self.groups,
-            padding_mode=self.padding_mode,
-        )
+        x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)

        _, _, h, w = x.shape

@@ -167,16 +154,7 @@ class DualConv3d(nn.Module):
        stride2 = self.stride2[0]
        padding2 = self.padding2[0]
        dilation2 = self.dilation2[0]
-        x = F.conv1d(
-            x,
-            weight2,
-            self.bias2,
-            stride2,
-            padding2,
-            dilation2,
-            self.groups,
-            padding_mode=self.padding_mode,
-        )
+        x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
        x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)

        return x
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -1,622 +0,0 @@
-# Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
-from __future__ import annotations
-
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import comfy.ldm.common_dit
-
-from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
-from comfy.ldm.modules.attention import optimized_attention_masked
-from comfy.ldm.flux.layers import EmbedND
-
-
-def modulate(x, scale):
-    return x * (1 + scale.unsqueeze(1))
-
-#############################################################################
-#                               Core NextDiT Model                              #
-#############################################################################
-
-
-class JointAttention(nn.Module):
-    """Multi-head attention module."""
-
-    def __init__(
-        self,
-        dim: int,
-        n_heads: int,
-        n_kv_heads: Optional[int],
-        qk_norm: bool,
-        operation_settings={},
-    ):
-        """
-        Initialize the Attention module.
-
-        Args:
-            dim (int): Number of input dimensions.
-            n_heads (int): Number of heads.
-            n_kv_heads (Optional[int]): Number of kv heads, if using GQA.
-
-        """
-        super().__init__()
-        self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
-        self.n_local_heads = n_heads
-        self.n_local_kv_heads = self.n_kv_heads
-        self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = dim // n_heads
-
-        self.qkv = operation_settings.get("operations").Linear(
-            dim,
-            (n_heads + self.n_kv_heads + self.n_kv_heads) * self.head_dim,
-            bias=False,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-        self.out = operation_settings.get("operations").Linear(
-            n_heads * self.head_dim,
-            dim,
-            bias=False,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-
-        if qk_norm:
-            self.q_norm = operation_settings.get("operations").RMSNorm(self.head_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-            self.k_norm = operation_settings.get("operations").RMSNorm(self.head_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        else:
-            self.q_norm = self.k_norm = nn.Identity()
-
-    @staticmethod
-    def apply_rotary_emb(
-        x_in: torch.Tensor,
-        freqs_cis: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Apply rotary embeddings to input tensors using the given frequency
-        tensor.
-
-        This function applies rotary embeddings to the given query 'xq' and
-        key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
-        input tensors are reshaped as complex numbers, and the frequency tensor
-        is reshaped for broadcasting compatibility. The resulting tensors
-        contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
-            freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
-                exponentials.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
-                and key tensor with rotary embeddings.
-        """
-
-        t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2)
-        t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
-        return t_out.reshape(*x_in.shape)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        freqs_cis: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-
-        Args:
-            x:
-            x_mask:
-            freqs_cis:
-
-        Returns:
-
-        """
-        bsz, seqlen, _ = x.shape
-
-        xq, xk, xv = torch.split(
-            self.qkv(x),
-            [
-                self.n_local_heads * self.head_dim,
-                self.n_local_kv_heads * self.head_dim,
-                self.n_local_kv_heads * self.head_dim,
-            ],
-            dim=-1,
-        )
-        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
-        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-
-        xq = self.q_norm(xq)
-        xk = self.k_norm(xk)
-
-        xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
-        xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis)
-
-        n_rep = self.n_local_heads // self.n_local_kv_heads
-        if n_rep >= 1:
-            xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
-            xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
-        output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True)
-
-        return self.out(output)
-
-
-class FeedForward(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
-        operation_settings={},
-    ):
-        """
-        Initialize the FeedForward module.
-
-        Args:
-            dim (int): Input dimension.
-            hidden_dim (int): Hidden dimension of the feedforward layer.
-            multiple_of (int): Value to ensure hidden dimension is a multiple
-                of this value.
-            ffn_dim_multiplier (float, optional): Custom multiplier for hidden
-                dimension. Defaults to None.
-
-        """
-        super().__init__()
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-        self.w1 = operation_settings.get("operations").Linear(
-            dim,
-            hidden_dim,
-            bias=False,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-        self.w2 = operation_settings.get("operations").Linear(
-            hidden_dim,
-            dim,
-            bias=False,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-        self.w3 = operation_settings.get("operations").Linear(
-            dim,
-            hidden_dim,
-            bias=False,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-
-    # @torch.compile
-    def _forward_silu_gating(self, x1, x3):
-        return F.silu(x1) * x3
-
-    def forward(self, x):
-        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
-
-
-class JointTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        layer_id: int,
-        dim: int,
-        n_heads: int,
-        n_kv_heads: int,
-        multiple_of: int,
-        ffn_dim_multiplier: float,
-        norm_eps: float,
-        qk_norm: bool,
-        modulation=True,
-        operation_settings={},
-    ) -> None:
-        """
-        Initialize a TransformerBlock.
-
-        Args:
-            layer_id (int): Identifier for the layer.
-            dim (int): Embedding dimension of the input features.
-            n_heads (int): Number of attention heads.
-            n_kv_heads (Optional[int]): Number of attention heads in key and
-                value features (if using GQA), or set to None for the same as
-                query.
-            multiple_of (int):
-            ffn_dim_multiplier (float):
-            norm_eps (float):
-
-        """
-        super().__init__()
-        self.dim = dim
-        self.head_dim = dim // n_heads
-        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings)
-        self.feed_forward = FeedForward(
-            dim=dim,
-            hidden_dim=4 * dim,
-            multiple_of=multiple_of,
-            ffn_dim_multiplier=ffn_dim_multiplier,
-            operation_settings=operation_settings,
-        )
-        self.layer_id = layer_id
-        self.attention_norm1 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.ffn_norm1 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-        self.attention_norm2 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.ffn_norm2 = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-        self.modulation = modulation
-        if modulation:
-            self.adaLN_modulation = nn.Sequential(
-                nn.SiLU(),
-                operation_settings.get("operations").Linear(
-                    min(dim, 1024),
-                    4 * dim,
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_mask: torch.Tensor,
-        freqs_cis: torch.Tensor,
-        adaln_input: Optional[torch.Tensor]=None,
-    ):
-        """
-        Perform a forward pass through the TransformerBlock.
-
-        Args:
-            x (torch.Tensor): Input tensor.
-            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
-
-        Returns:
-            torch.Tensor: Output tensor after applying attention and
-                feedforward layers.
-
-        """
-        if self.modulation:
-            assert adaln_input is not None
-            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
-
-            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
-                self.attention(
-                    modulate(self.attention_norm1(x), scale_msa),
-                    x_mask,
-                    freqs_cis,
-                )
-            )
-            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
-                self.feed_forward(
-                    modulate(self.ffn_norm1(x), scale_mlp),
-                )
-            )
-        else:
-            assert adaln_input is None
-            x = x + self.attention_norm2(
-                self.attention(
-                    self.attention_norm1(x),
-                    x_mask,
-                    freqs_cis,
-                )
-            )
-            x = x + self.ffn_norm2(
-                self.feed_forward(
-                    self.ffn_norm1(x),
-                )
-            )
-        return x
-
-
-class FinalLayer(nn.Module):
-    """
-    The final layer of NextDiT.
-    """
-
-    def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}):
-        super().__init__()
-        self.norm_final = operation_settings.get("operations").LayerNorm(
-            hidden_size,
-            elementwise_affine=False,
-            eps=1e-6,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-        self.linear = operation_settings.get("operations").Linear(
-            hidden_size,
-            patch_size * patch_size * out_channels,
-            bias=True,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operation_settings.get("operations").Linear(
-                min(hidden_size, 1024),
-                hidden_size,
-                bias=True,
-                device=operation_settings.get("device"),
-                dtype=operation_settings.get("dtype"),
-            ),
-        )
-
-    def forward(self, x, c):
-        scale = self.adaLN_modulation(c)
-        x = modulate(self.norm_final(x), scale)
-        x = self.linear(x)
-        return x
-
-
-class NextDiT(nn.Module):
-    """
-    Diffusion model with a Transformer backbone.
-    """
-
-    def __init__(
-        self,
-        patch_size: int = 2,
-        in_channels: int = 4,
-        dim: int = 4096,
-        n_layers: int = 32,
-        n_refiner_layers: int = 2,
-        n_heads: int = 32,
-        n_kv_heads: Optional[int] = None,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
-        norm_eps: float = 1e-5,
-        qk_norm: bool = False,
-        cap_feat_dim: int = 5120,
-        axes_dims: List[int] = (16, 56, 56),
-        axes_lens: List[int] = (1, 512, 512),
-        image_model=None,
-        device=None,
-        dtype=None,
-        operations=None,
-    ) -> None:
-        super().__init__()
-        self.dtype = dtype
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-        self.in_channels = in_channels
-        self.out_channels = in_channels
-        self.patch_size = patch_size
-
-        self.x_embedder = operation_settings.get("operations").Linear(
-            in_features=patch_size * patch_size * in_channels,
-            out_features=dim,
-            bias=True,
-            device=operation_settings.get("device"),
-            dtype=operation_settings.get("dtype"),
-        )
-
-        self.noise_refiner = nn.ModuleList(
-            [
-                JointTransformerBlock(
-                    layer_id,
-                    dim,
-                    n_heads,
-                    n_kv_heads,
-                    multiple_of,
-                    ffn_dim_multiplier,
-                    norm_eps,
-                    qk_norm,
-                    modulation=True,
-                    operation_settings=operation_settings,
-                )
-                for layer_id in range(n_refiner_layers)
-            ]
-        )
-        self.context_refiner = nn.ModuleList(
-            [
-                JointTransformerBlock(
-                    layer_id,
-                    dim,
-                    n_heads,
-                    n_kv_heads,
-                    multiple_of,
-                    ffn_dim_multiplier,
-                    norm_eps,
-                    qk_norm,
-                    modulation=False,
-                    operation_settings=operation_settings,
-                )
-                for layer_id in range(n_refiner_layers)
-            ]
-        )
-
-        self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
-        self.cap_embedder = nn.Sequential(
-            operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-            operation_settings.get("operations").Linear(
-                cap_feat_dim,
-                dim,
-                bias=True,
-                device=operation_settings.get("device"),
-                dtype=operation_settings.get("dtype"),
-            ),
-        )
-
-        self.layers = nn.ModuleList(
-            [
-                JointTransformerBlock(
-                    layer_id,
-                    dim,
-                    n_heads,
-                    n_kv_heads,
-                    multiple_of,
-                    ffn_dim_multiplier,
-                    norm_eps,
-                    qk_norm,
-                    operation_settings=operation_settings,
-                )
-                for layer_id in range(n_layers)
-            ]
-        )
-        self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)
-
-        assert (dim // n_heads) == sum(axes_dims)
-        self.axes_dims = axes_dims
-        self.axes_lens = axes_lens
-        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims)
-        self.dim = dim
-        self.n_heads = n_heads
-
-    def unpatchify(
-        self, x: torch.Tensor, img_size: List[Tuple[int, int]], cap_size: List[int], return_tensor=False
-    ) -> List[torch.Tensor]:
-        """
-        x: (N, T, patch_size**2 * C)
-        imgs: (N, H, W, C)
-        """
-        pH = pW = self.patch_size
-        imgs = []
-        for i in range(x.size(0)):
-            H, W = img_size[i]
-            begin = cap_size[i]
-            end = begin + (H // pH) * (W // pW)
-            imgs.append(
-                x[i][begin:end]
-                .view(H // pH, W // pW, pH, pW, self.out_channels)
-                .permute(4, 0, 2, 1, 3)
-                .flatten(3, 4)
-                .flatten(1, 2)
-            )
-
-        if return_tensor:
-            imgs = torch.stack(imgs, dim=0)
-        return imgs
-
-    def patchify_and_embed(
-        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
-        bsz = len(x)
-        pH = pW = self.patch_size
-        device = x[0].device
-        dtype = x[0].dtype
-
-        if cap_mask is not None:
-            l_effective_cap_len = cap_mask.sum(dim=1).tolist()
-        else:
-            l_effective_cap_len = [num_tokens] * bsz
-
-        if cap_mask is not None and not torch.is_floating_point(cap_mask):
-            cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max
-
-        img_sizes = [(img.size(1), img.size(2)) for img in x]
-        l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes]
-
-        max_seq_len = max(
-            (cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len))
-        )
-        max_cap_len = max(l_effective_cap_len)
-        max_img_len = max(l_effective_img_len)
-
-        position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device)
-
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-            H, W = img_sizes[i]
-            H_tokens, W_tokens = H // pH, W // pW
-            assert H_tokens * W_tokens == img_len
-
-            position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device)
-            position_ids[i, cap_len:cap_len+img_len, 0] = cap_len
-            row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten()
-            col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten()
-            position_ids[i, cap_len:cap_len+img_len, 1] = row_ids
-            position_ids[i, cap_len:cap_len+img_len, 2] = col_ids
-
-        freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype)
-
-        # build freqs_cis for cap and image individually
-        cap_freqs_cis_shape = list(freqs_cis.shape)
-        # cap_freqs_cis_shape[1] = max_cap_len
-        cap_freqs_cis_shape[1] = cap_feats.shape[1]
-        cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
-
-        img_freqs_cis_shape = list(freqs_cis.shape)
-        img_freqs_cis_shape[1] = max_img_len
-        img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
-
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-            cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len]
-            img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len]
-
-        # refine context
-        for layer in self.context_refiner:
-            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis)
-
-        # refine image
-        flat_x = []
-        for i in range(bsz):
-            img = x[i]
-            C, H, W = img.size()
-            img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1)
-            flat_x.append(img)
-        x = flat_x
-        padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype)
-        padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device)
-        for i in range(bsz):
-            padded_img_embed[i, :l_effective_img_len[i]] = x[i]
-            padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max
-
-        padded_img_embed = self.x_embedder(padded_img_embed)
-        padded_img_mask = padded_img_mask.unsqueeze(1)
-        for layer in self.noise_refiner:
-            padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t)
-
-        if cap_mask is not None:
-            mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
-            mask[:, :max_cap_len] = cap_mask[:, :max_cap_len]
-        else:
-            mask = None
-
-        padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype)
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-
-            padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len]
-            padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len]
-
-        return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis
-
-    # def forward(self, x, t, cap_feats, cap_mask):
-    def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
-        t = 1.0 - timesteps
-        cap_feats = context
-        cap_mask = attention_mask
-        bs, c, h, w = x.shape
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-        """
-        Forward pass of NextDiT.
-        t: (N,) tensor of diffusion timesteps
-        y: (N,) tensor of text tokens/features
-        """
-
-        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
-        adaln_input = t
-
-        cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
-
-        x_is_tensor = isinstance(x, torch.Tensor)
-        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens)
-        freqs_cis = freqs_cis.to(x.device)
-
-        for layer in self.layers:
-            x = layer(x, mask, freqs_cis, adaln_input)
-
-        x = self.final_layer(x, adaln_input)
-        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
-
-        return -x
-
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -1,6 +1,4 @@
 import math
-import sys
-
 import torch
 import torch.nn.functional as F
 from torch import nn, einsum
@@ -18,18 +16,7 @@ if model_management.xformers_enabled():
    import xformers.ops

 if model_management.sage_attention_enabled():
-    try:
-        from sageattention import sageattn
-    except ModuleNotFoundError:
-        logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
-        exit(-1)
-
-if model_management.flash_attention_enabled():
-    try:
-        from flash_attn import flash_attn_func
-    except ModuleNotFoundError:
-        logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn")
-        exit(-1)
+    from sageattention import sageattn

 from comfy.cli_args import args
 import comfy.ops
@@ -37,24 +24,38 @@ ops = comfy.ops.disable_weight_init

 FORCE_UPCAST_ATTENTION_DTYPE = model_management.force_upcast_attention_dtype()

-def get_attn_precision(attn_precision, current_dtype):
+def get_attn_precision(attn_precision):
    if args.dont_upcast_attention:
        return None
-
-    if FORCE_UPCAST_ATTENTION_DTYPE is not None and current_dtype in FORCE_UPCAST_ATTENTION_DTYPE:
-        return FORCE_UPCAST_ATTENTION_DTYPE[current_dtype]
+    if FORCE_UPCAST_ATTENTION_DTYPE is not None:
+        return FORCE_UPCAST_ATTENTION_DTYPE
    return attn_precision

 def exists(val):
    return val is not None


+def uniq(arr):
+    return{el: True for el in arr}.keys()
+
+
 def default(val, d):
    if exists(val):
        return val
    return d


+def max_neg_value(t):
+    return -torch.finfo(t.dtype).max
+
+
+def init_(tensor):
+    dim = tensor.shape[-1]
+    std = 1 / math.sqrt(dim)
+    tensor.uniform_(-std, std)
+    return tensor
+
+
 # feedforward
 class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out, dtype=None, device=None, operations=ops):
@@ -89,7 +90,7 @@ def Normalize(in_channels, dtype=None, device=None):
    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)

 def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
-    attn_precision = get_attn_precision(attn_precision, q.dtype)
+    attn_precision = get_attn_precision(attn_precision)

    if skip_reshape:
        b, _, _, dim_head = q.shape
@@ -158,7 +159,7 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape


 def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
-    attn_precision = get_attn_precision(attn_precision, query.dtype)
+    attn_precision = get_attn_precision(attn_precision)

    if skip_reshape:
        b, _, _, dim_head = query.shape
@@ -228,7 +229,7 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
    return hidden_states

 def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
-    attn_precision = get_attn_precision(attn_precision, q.dtype)
+    attn_precision = get_attn_precision(attn_precision)

    if skip_reshape:
        b, _, _, dim_head = q.shape
@@ -471,7 +472,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
 def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
    if skip_reshape:
        b, _, _, dim_head = q.shape
-        tensor_layout = "HND"
+        tensor_layout="HND"
    else:
        b, _, dim_head = q.shape
        dim_head //= heads
@@ -479,7 +480,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
            lambda t: t.view(b, -1, heads, dim_head),
            (q, k, v),
        )
-        tensor_layout = "NHD"
+        tensor_layout="NHD"

    if mask is not None:
        # add a batch dimension if there isn't already one
@@ -489,17 +490,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
        if mask.ndim == 3:
            mask = mask.unsqueeze(1)

-    try:
-        out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
-    except Exception as e:
-        logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
-        if tensor_layout == "NHD":
-            q, k, v = map(
-                lambda t: t.transpose(1, 2),
-                (q, k, v),
-            )
-        return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape)
-
+    out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
    if tensor_layout == "HND":
        if not skip_output_reshape:
            out = (
@@ -513,63 +504,6 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
    return out


-try:
-    @torch.library.custom_op("flash_attention::flash_attn", mutates_args=())
-    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
-        return flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)
-
-
-    @flash_attn_wrapper.register_fake
-    def flash_attn_fake(q, k, v, dropout_p=0.0, causal=False):
-        # Output shape is the same as q
-        return q.new_empty(q.shape)
-except AttributeError as error:
-    FLASH_ATTN_ERROR = error
-
-    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
-        assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}"
-
-
-def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
-    if skip_reshape:
-        b, _, _, dim_head = q.shape
-    else:
-        b, _, dim_head = q.shape
-        dim_head //= heads
-        q, k, v = map(
-            lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
-            (q, k, v),
-        )
-
-    if mask is not None:
-        # add a batch dimension if there isn't already one
-        if mask.ndim == 2:
-            mask = mask.unsqueeze(0)
-        # add a heads dimension if there isn't already one
-        if mask.ndim == 3:
-            mask = mask.unsqueeze(1)
-
-    try:
-        assert mask is None
-        out = flash_attn_wrapper(
-            q.transpose(1, 2),
-            k.transpose(1, 2),
-            v.transpose(1, 2),
-            dropout_p=0.0,
-            causal=False,
-        ).transpose(1, 2)
-    except Exception as e:
-        logging.warning(f"Flash Attention failed, using default SDPA: {e}")
-        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
-    if not skip_output_reshape:
-        out = (
-            out.transpose(1, 2).reshape(b, -1, heads * dim_head)
-        )
-    return out
-
-
 optimized_attention = attention_basic

 if model_management.sage_attention_enabled():
@@ -578,9 +512,6 @@ if model_management.sage_attention_enabled():
 elif model_management.xformers_enabled():
    logging.info("Using xformers attention")
    optimized_attention = attention_xformers
-elif model_management.flash_attention_enabled():
-    logging.info("Using Flash Attention")
-    optimized_attention = attention_flash
 elif model_management.pytorch_attention_enabled():
    logging.info("Using pytorch attention")
    optimized_attention = attention_pytorch
@@ -847,7 +778,6 @@ class SpatialTransformer(nn.Module):
        if not isinstance(context, list):
            context = [context] * len(self.transformer_blocks)
        b, c, h, w = x.shape
-        transformer_options["activations_shape"] = list(x.shape)
        x_in = x
        x = self.norm(x)
        if not self.use_linear:
@@ -963,7 +893,6 @@ class SpatialVideoTransformer(SpatialTransformer):
        transformer_options={}
    ) -> torch.Tensor:
        _, _, h, w = x.shape
-        transformer_options["activations_shape"] = list(x.shape)
        x_in = x
        spatial_context = None
        if exists(context):
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -321,7 +321,7 @@ class SelfAttention(nn.Module):

 class RMSNorm(torch.nn.Module):
    def __init__(
-        self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6, device=None, dtype=None, **kwargs
+        self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6, device=None, dtype=None
    ):
        """
        Initialize the RMSNorm normalization layer.
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@@ -293,17 +293,6 @@ def pytorch_attention(q, k, v):
    return out


-def vae_attention():
-    if model_management.xformers_enabled_vae():
-        logging.info("Using xformers attention in VAE")
-        return xformers_attention
-    elif model_management.pytorch_attention_enabled_vae():
-        logging.info("Using pytorch attention in VAE")
-        return pytorch_attention
-    else:
-        logging.info("Using split attention in VAE")
-        return normal_attention
-
 class AttnBlock(nn.Module):
    def __init__(self, in_channels, conv_op=ops.Conv2d):
        super().__init__()
@@ -331,7 +320,15 @@ class AttnBlock(nn.Module):
                                        stride=1,
                                        padding=0)

-        self.optimized_attention = vae_attention()
+        if model_management.xformers_enabled_vae():
+            logging.info("Using xformers attention in VAE")
+            self.optimized_attention = xformers_attention
+        elif model_management.pytorch_attention_enabled():
+            logging.info("Using pytorch attention in VAE")
+            self.optimized_attention = pytorch_attention
+        else:
+            logging.info("Using split attention in VAE")
+            self.optimized_attention = normal_attention

    def forward(self, x):
        h_ = x
@@ -702,6 +699,9 @@ class Decoder(nn.Module):
                                        padding=1)

    def forward(self, z, **kwargs):
+        #assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+
        # timestep embedding
        temb = None

--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1,639 +0,0 @@
-# original version: https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py
-# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-import math
-
-import torch
-import torch.nn as nn
-from einops import repeat
-
-from comfy.ldm.modules.attention import optimized_attention
-from comfy.ldm.flux.layers import EmbedND
-from comfy.ldm.flux.math import apply_rope
-import comfy.ldm.common_dit
-import comfy.model_management
-
-
-def sinusoidal_embedding_1d(dim, position):
-    # preprocess
-    assert dim % 2 == 0
-    half = dim // 2
-    position = position.type(torch.float32)
-
-    # calculation
-    sinusoid = torch.outer(
-        position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
-    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
-    return x
-
-
-class WanSelfAttention(nn.Module):
-
-    def __init__(self,
-                 dim,
-                 num_heads,
-                 window_size=(-1, -1),
-                 qk_norm=True,
-                 eps=1e-6, operation_settings={}):
-        assert dim % num_heads == 0
-        super().__init__()
-        self.dim = dim
-        self.num_heads = num_heads
-        self.head_dim = dim // num_heads
-        self.window_size = window_size
-        self.qk_norm = qk_norm
-        self.eps = eps
-
-        # layers
-        self.q = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
-        self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
-
-    def forward(self, x, freqs):
-        r"""
-        Args:
-            x(Tensor): Shape [B, L, num_heads, C / num_heads]
-            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
-        """
-        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
-
-        # query, key, value function
-        def qkv_fn(x):
-            q = self.norm_q(self.q(x)).view(b, s, n, d)
-            k = self.norm_k(self.k(x)).view(b, s, n, d)
-            v = self.v(x).view(b, s, n * d)
-            return q, k, v
-
-        q, k, v = qkv_fn(x)
-        q, k = apply_rope(q, k, freqs)
-
-        x = optimized_attention(
-            q.view(b, s, n * d),
-            k.view(b, s, n * d),
-            v,
-            heads=self.num_heads,
-        )
-
-        x = self.o(x)
-        return x
-
-
-class WanT2VCrossAttention(WanSelfAttention):
-
-    def forward(self, x, context, **kwargs):
-        r"""
-        Args:
-            x(Tensor): Shape [B, L1, C]
-            context(Tensor): Shape [B, L2, C]
-        """
-        # compute query, key, value
-        q = self.norm_q(self.q(x))
-        k = self.norm_k(self.k(context))
-        v = self.v(context)
-
-        # compute attention
-        x = optimized_attention(q, k, v, heads=self.num_heads)
-
-        x = self.o(x)
-        return x
-
-
-class WanI2VCrossAttention(WanSelfAttention):
-
-    def __init__(self,
-                 dim,
-                 num_heads,
-                 window_size=(-1, -1),
-                 qk_norm=True,
-                 eps=1e-6, operation_settings={}):
-        super().__init__(dim, num_heads, window_size, qk_norm, eps, operation_settings=operation_settings)
-
-        self.k_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.v_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        # self.alpha = nn.Parameter(torch.zeros((1, )))
-        self.norm_k_img = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
-
-    def forward(self, x, context, context_img_len):
-        r"""
-        Args:
-            x(Tensor): Shape [B, L1, C]
-            context(Tensor): Shape [B, L2, C]
-        """
-        context_img = context[:, :context_img_len]
-        context = context[:, context_img_len:]
-
-        # compute query, key, value
-        q = self.norm_q(self.q(x))
-        k = self.norm_k(self.k(context))
-        v = self.v(context)
-        k_img = self.norm_k_img(self.k_img(context_img))
-        v_img = self.v_img(context_img)
-        img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads)
-        # compute attention
-        x = optimized_attention(q, k, v, heads=self.num_heads)
-
-        # output
-        x = x + img_x
-        x = self.o(x)
-        return x
-
-
-WAN_CROSSATTENTION_CLASSES = {
-    't2v_cross_attn': WanT2VCrossAttention,
-    'i2v_cross_attn': WanI2VCrossAttention,
-}
-
-
-class WanAttentionBlock(nn.Module):
-
-    def __init__(self,
-                 cross_attn_type,
-                 dim,
-                 ffn_dim,
-                 num_heads,
-                 window_size=(-1, -1),
-                 qk_norm=True,
-                 cross_attn_norm=False,
-                 eps=1e-6, operation_settings={}):
-        super().__init__()
-        self.dim = dim
-        self.ffn_dim = ffn_dim
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.qk_norm = qk_norm
-        self.cross_attn_norm = cross_attn_norm
-        self.eps = eps
-
-        # layers
-        self.norm1 = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
-                                          eps, operation_settings=operation_settings)
-        self.norm3 = operation_settings.get("operations").LayerNorm(
-            dim, eps,
-            elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if cross_attn_norm else nn.Identity()
-        self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim,
-                                                                      num_heads,
-                                                                      (-1, -1),
-                                                                      qk_norm,
-                                                                      eps, operation_settings=operation_settings)
-        self.norm2 = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.ffn = nn.Sequential(
-            operation_settings.get("operations").Linear(dim, ffn_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.GELU(approximate='tanh'),
-            operation_settings.get("operations").Linear(ffn_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-        # modulation
-        self.modulation = nn.Parameter(torch.empty(1, 6, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-    def forward(
-        self,
-        x,
-        e,
-        freqs,
-        context,
-        context_img_len=257,
-    ):
-        r"""
-        Args:
-            x(Tensor): Shape [B, L, C]
-            e(Tensor): Shape [B, 6, C]
-            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
-        """
-        # assert e.dtype == torch.float32
-
-        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
-        # assert e[0].dtype == torch.float32
-
-        # self-attention
-        y = self.self_attn(
-            self.norm1(x) * (1 + e[1]) + e[0],
-            freqs)
-
-        x = x + y * e[2]
-
-        # cross-attention & ffn
-        x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len)
-        y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
-        x = x + y * e[5]
-        return x
-
-
-class VaceWanAttentionBlock(WanAttentionBlock):
-    def __init__(
-            self,
-            cross_attn_type,
-            dim,
-            ffn_dim,
-            num_heads,
-            window_size=(-1, -1),
-            qk_norm=True,
-            cross_attn_norm=False,
-            eps=1e-6,
-            block_id=0,
-            operation_settings={}
-    ):
-        super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
-        self.block_id = block_id
-        if block_id == 0:
-            self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-    def forward(self, c, x, **kwargs):
-        if self.block_id == 0:
-            c = self.before_proj(c) + x
-        c = super().forward(c, **kwargs)
-        c_skip = self.after_proj(c)
-        return c_skip, c
-
-
-class Head(nn.Module):
-
-    def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}):
-        super().__init__()
-        self.dim = dim
-        self.out_dim = out_dim
-        self.patch_size = patch_size
-        self.eps = eps
-
-        # layers
-        out_dim = math.prod(patch_size) * out_dim
-        self.norm = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.head = operation_settings.get("operations").Linear(dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-
-        # modulation
-        self.modulation = nn.Parameter(torch.empty(1, 2, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-    def forward(self, x, e):
-        r"""
-        Args:
-            x(Tensor): Shape [B, L1, C]
-            e(Tensor): Shape [B, C]
-        """
-        # assert e.dtype == torch.float32
-        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1)
-        x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
-        return x
-
-
-class MLPProj(torch.nn.Module):
-
-    def __init__(self, in_dim, out_dim, flf_pos_embed_token_number=None, operation_settings={}):
-        super().__init__()
-
-        self.proj = torch.nn.Sequential(
-            operation_settings.get("operations").LayerNorm(in_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), operation_settings.get("operations").Linear(in_dim, in_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-            torch.nn.GELU(), operation_settings.get("operations").Linear(in_dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-            operation_settings.get("operations").LayerNorm(out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-        if flf_pos_embed_token_number is not None:
-            self.emb_pos = nn.Parameter(torch.empty((1, flf_pos_embed_token_number, in_dim), device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-        else:
-            self.emb_pos = None
-
-    def forward(self, image_embeds):
-        if self.emb_pos is not None:
-            image_embeds = image_embeds[:, :self.emb_pos.shape[1]] + comfy.model_management.cast_to(self.emb_pos[:, :image_embeds.shape[1]], dtype=image_embeds.dtype, device=image_embeds.device)
-
-        clip_extra_context_tokens = self.proj(image_embeds)
-        return clip_extra_context_tokens
-
-
-class WanModel(torch.nn.Module):
-    r"""
-    Wan diffusion backbone supporting both text-to-video and image-to-video.
-    """
-
-    def __init__(self,
-                 model_type='t2v',
-                 patch_size=(1, 2, 2),
-                 text_len=512,
-                 in_dim=16,
-                 dim=2048,
-                 ffn_dim=8192,
-                 freq_dim=256,
-                 text_dim=4096,
-                 out_dim=16,
-                 num_heads=16,
-                 num_layers=32,
-                 window_size=(-1, -1),
-                 qk_norm=True,
-                 cross_attn_norm=True,
-                 eps=1e-6,
-                 flf_pos_embed_token_number=None,
-                 image_model=None,
-                 device=None,
-                 dtype=None,
-                 operations=None,
-                 ):
-        r"""
-        Initialize the diffusion model backbone.
-
-        Args:
-            model_type (`str`, *optional*, defaults to 't2v'):
-                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
-            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
-                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
-            text_len (`int`, *optional*, defaults to 512):
-                Fixed length for text embeddings
-            in_dim (`int`, *optional*, defaults to 16):
-                Input video channels (C_in)
-            dim (`int`, *optional*, defaults to 2048):
-                Hidden dimension of the transformer
-            ffn_dim (`int`, *optional*, defaults to 8192):
-                Intermediate dimension in feed-forward network
-            freq_dim (`int`, *optional*, defaults to 256):
-                Dimension for sinusoidal time embeddings
-            text_dim (`int`, *optional*, defaults to 4096):
-                Input dimension for text embeddings
-            out_dim (`int`, *optional*, defaults to 16):
-                Output video channels (C_out)
-            num_heads (`int`, *optional*, defaults to 16):
-                Number of attention heads
-            num_layers (`int`, *optional*, defaults to 32):
-                Number of transformer blocks
-            window_size (`tuple`, *optional*, defaults to (-1, -1)):
-                Window size for local attention (-1 indicates global attention)
-            qk_norm (`bool`, *optional*, defaults to True):
-                Enable query/key normalization
-            cross_attn_norm (`bool`, *optional*, defaults to False):
-                Enable cross-attention normalization
-            eps (`float`, *optional*, defaults to 1e-6):
-                Epsilon value for normalization layers
-        """
-
-        super().__init__()
-        self.dtype = dtype
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-
-        assert model_type in ['t2v', 'i2v']
-        self.model_type = model_type
-
-        self.patch_size = patch_size
-        self.text_len = text_len
-        self.in_dim = in_dim
-        self.dim = dim
-        self.ffn_dim = ffn_dim
-        self.freq_dim = freq_dim
-        self.text_dim = text_dim
-        self.out_dim = out_dim
-        self.num_heads = num_heads
-        self.num_layers = num_layers
-        self.window_size = window_size
-        self.qk_norm = qk_norm
-        self.cross_attn_norm = cross_attn_norm
-        self.eps = eps
-
-        # embeddings
-        self.patch_embedding = operations.Conv3d(
-            in_dim, dim, kernel_size=patch_size, stride=patch_size, device=operation_settings.get("device"), dtype=torch.float32)
-        self.text_embedding = nn.Sequential(
-            operations.Linear(text_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.GELU(approximate='tanh'),
-            operations.Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-        self.time_embedding = nn.Sequential(
-            operations.Linear(freq_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.SiLU(), operations.Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-        self.time_projection = nn.Sequential(nn.SiLU(), operations.Linear(dim, dim * 6, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
-
-        # blocks
-        cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
-        self.blocks = nn.ModuleList([
-            WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
-                              window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
-            for _ in range(num_layers)
-        ])
-
-        # head
-        self.head = Head(dim, out_dim, patch_size, eps, operation_settings=operation_settings)
-
-        d = dim // num_heads
-        self.rope_embedder = EmbedND(dim=d, theta=10000.0, axes_dim=[d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)])
-
-        if model_type == 'i2v':
-            self.img_emb = MLPProj(1280, dim, flf_pos_embed_token_number=flf_pos_embed_token_number, operation_settings=operation_settings)
-        else:
-            self.img_emb = None
-
-    def forward_orig(
-        self,
-        x,
-        t,
-        context,
-        clip_fea=None,
-        freqs=None,
-        transformer_options={},
-        **kwargs,
-    ):
-        r"""
-        Forward pass through the diffusion model
-
-        Args:
-            x (Tensor):
-                List of input video tensors with shape [B, C_in, F, H, W]
-            t (Tensor):
-                Diffusion timesteps tensor of shape [B]
-            context (List[Tensor]):
-                List of text embeddings each with shape [B, L, C]
-            seq_len (`int`):
-                Maximum sequence length for positional encoding
-            clip_fea (Tensor, *optional*):
-                CLIP image features for image-to-video mode
-            y (List[Tensor], *optional*):
-                Conditional video inputs for image-to-video mode, same shape as x
-
-        Returns:
-            List[Tensor]:
-                List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
-        """
-        # embeddings
-        x = self.patch_embedding(x.float()).to(x.dtype)
-        grid_sizes = x.shape[2:]
-        x = x.flatten(2).transpose(1, 2)
-
-        # time embeddings
-        e = self.time_embedding(
-            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
-        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
-
-        # context
-        context = self.text_embedding(context)
-
-        context_img_len = None
-        if clip_fea is not None:
-            if self.img_emb is not None:
-                context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
-                context = torch.concat([context_clip, context], dim=1)
-            context_img_len = clip_fea.shape[-2]
-
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
-                    return out
-                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
-                x = out["img"]
-            else:
-                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
-
-        # head
-        x = self.head(x, e)
-
-        # unpatchify
-        x = self.unpatchify(x, grid_sizes)
-        return x
-
-    def forward(self, x, timestep, context, clip_fea=None, transformer_options={}, **kwargs):
-        bs, c, t, h, w = x.shape
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
-        patch_size = self.patch_size
-        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
-        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
-        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
-        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
-        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
-        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
-        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
-
-        freqs = self.rope_embedder(img_ids).movedim(1, 2)
-        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
-
-    def unpatchify(self, x, grid_sizes):
-        r"""
-        Reconstruct video tensors from patch embeddings.
-
-        Args:
-            x (List[Tensor]):
-                List of patchified features, each with shape [L, C_out * prod(patch_size)]
-            grid_sizes (Tensor):
-                Original spatial-temporal grid dimensions before patching,
-                    shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
-
-        Returns:
-            List[Tensor]:
-                Reconstructed video tensors with shape [L, C_out, F, H / 8, W / 8]
-        """
-
-        c = self.out_dim
-        u = x
-        b = u.shape[0]
-        u = u[:, :math.prod(grid_sizes)].view(b, *grid_sizes, *self.patch_size, c)
-        u = torch.einsum('bfhwpqrc->bcfphqwr', u)
-        u = u.reshape(b, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)])
-        return u
-
-
-class VaceWanModel(WanModel):
-    r"""
-    Wan diffusion backbone supporting both text-to-video and image-to-video.
-    """
-
-    def __init__(self,
-                 model_type='vace',
-                 patch_size=(1, 2, 2),
-                 text_len=512,
-                 in_dim=16,
-                 dim=2048,
-                 ffn_dim=8192,
-                 freq_dim=256,
-                 text_dim=4096,
-                 out_dim=16,
-                 num_heads=16,
-                 num_layers=32,
-                 window_size=(-1, -1),
-                 qk_norm=True,
-                 cross_attn_norm=True,
-                 eps=1e-6,
-                 flf_pos_embed_token_number=None,
-                 image_model=None,
-                 vace_layers=None,
-                 vace_in_dim=None,
-                 device=None,
-                 dtype=None,
-                 operations=None,
-                 ):
-
-        super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations)
-        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
-
-        # Vace
-        if vace_layers is not None:
-            self.vace_layers = vace_layers
-            self.vace_in_dim = vace_in_dim
-            # vace blocks
-            self.vace_blocks = nn.ModuleList([
-                VaceWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm, self.cross_attn_norm, self.eps, block_id=i, operation_settings=operation_settings)
-                for i in range(self.vace_layers)
-            ])
-
-            self.vace_layers_mapping = {i: n for n, i in enumerate(range(0, self.num_layers, self.num_layers // self.vace_layers))}
-            # vace patch embeddings
-            self.vace_patch_embedding = operations.Conv3d(
-                self.vace_in_dim, self.dim, kernel_size=self.patch_size, stride=self.patch_size, device=device, dtype=torch.float32
-            )
-
-    def forward_orig(
-        self,
-        x,
-        t,
-        context,
-        vace_context,
-        vace_strength=1.0,
-        clip_fea=None,
-        freqs=None,
-        transformer_options={},
-        **kwargs,
-    ):
-        # embeddings
-        x = self.patch_embedding(x.float()).to(x.dtype)
-        grid_sizes = x.shape[2:]
-        x = x.flatten(2).transpose(1, 2)
-
-        # time embeddings
-        e = self.time_embedding(
-            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
-        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
-
-        # context
-        context = self.text_embedding(context)
-
-        context_img_len = None
-        if clip_fea is not None:
-            if self.img_emb is not None:
-                context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
-                context = torch.concat([context_clip, context], dim=1)
-            context_img_len = clip_fea.shape[-2]
-
-        c = self.vace_patch_embedding(vace_context.float()).to(vace_context.dtype)
-        c = c.flatten(2).transpose(1, 2)
-
-        # arguments
-        x_orig = x
-
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
-                    return out
-                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
-                x = out["img"]
-            else:
-                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
-
-            ii = self.vace_layers_mapping.get(i, None)
-            if ii is not None:
-                c_skip, c = self.vace_blocks[ii](c, x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
-                x += c_skip * vace_strength
-                del c_skip
-        # head
-        x = self.head(x, e)
-
-        # unpatchify
-        x = self.unpatchify(x, grid_sizes)
-        return x
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@@ -1,567 +0,0 @@
-# original version: https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/vae.py
-# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from comfy.ldm.modules.diffusionmodules.model import vae_attention
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-CACHE_T = 2
-
-
-class CausalConv3d(ops.Conv3d):
-    """
-    Causal 3d convolusion.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._padding = (self.padding[2], self.padding[2], self.padding[1],
-                         self.padding[1], 2 * self.padding[0], 0)
-        self.padding = (0, 0, 0)
-
-    def forward(self, x, cache_x=None):
-        padding = list(self._padding)
-        if cache_x is not None and self._padding[4] > 0:
-            cache_x = cache_x.to(x.device)
-            x = torch.cat([cache_x, x], dim=2)
-            padding[4] -= cache_x.shape[2]
-        x = F.pad(x, padding)
-
-        return super().forward(x)
-
-
-class RMS_norm(nn.Module):
-
-    def __init__(self, dim, channel_first=True, images=True, bias=False):
-        super().__init__()
-        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
-        shape = (dim, *broadcastable_dims) if channel_first else (dim,)
-
-        self.channel_first = channel_first
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.ones(shape))
-        self.bias = nn.Parameter(torch.zeros(shape)) if bias else None
-
-    def forward(self, x):
-        return F.normalize(
-            x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0)
-
-
-class Upsample(nn.Upsample):
-
-    def forward(self, x):
-        """
-        Fix bfloat16 support for nearest neighbor interpolation.
-        """
-        return super().forward(x.float()).type_as(x)
-
-
-class Resample(nn.Module):
-
-    def __init__(self, dim, mode):
-        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
-                        'downsample3d')
-        super().__init__()
-        self.dim = dim
-        self.mode = mode
-
-        # layers
-        if mode == 'upsample2d':
-            self.resample = nn.Sequential(
-                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
-                ops.Conv2d(dim, dim // 2, 3, padding=1))
-        elif mode == 'upsample3d':
-            self.resample = nn.Sequential(
-                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
-                ops.Conv2d(dim, dim // 2, 3, padding=1))
-            self.time_conv = CausalConv3d(
-                dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
-
-        elif mode == 'downsample2d':
-            self.resample = nn.Sequential(
-                nn.ZeroPad2d((0, 1, 0, 1)),
-                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
-        elif mode == 'downsample3d':
-            self.resample = nn.Sequential(
-                nn.ZeroPad2d((0, 1, 0, 1)),
-                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
-            self.time_conv = CausalConv3d(
-                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
-
-        else:
-            self.resample = nn.Identity()
-
-    def forward(self, x, feat_cache=None, feat_idx=[0]):
-        b, c, t, h, w = x.size()
-        if self.mode == 'upsample3d':
-            if feat_cache is not None:
-                idx = feat_idx[0]
-                if feat_cache[idx] is None:
-                    feat_cache[idx] = 'Rep'
-                    feat_idx[0] += 1
-                else:
-
-                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
-                    if cache_x.shape[2] < 2 and feat_cache[
-                            idx] is not None and feat_cache[idx] != 'Rep':
-                        # cache last frame of last two chunk
-                        cache_x = torch.cat([
-                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                                cache_x.device), cache_x
-                        ],
-                                            dim=2)
-                    if cache_x.shape[2] < 2 and feat_cache[
-                            idx] is not None and feat_cache[idx] == 'Rep':
-                        cache_x = torch.cat([
-                            torch.zeros_like(cache_x).to(cache_x.device),
-                            cache_x
-                        ],
-                                            dim=2)
-                    if feat_cache[idx] == 'Rep':
-                        x = self.time_conv(x)
-                    else:
-                        x = self.time_conv(x, feat_cache[idx])
-                    feat_cache[idx] = cache_x
-                    feat_idx[0] += 1
-
-                    x = x.reshape(b, 2, c, t, h, w)
-                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
-                                    3)
-                    x = x.reshape(b, c, t * 2, h, w)
-        t = x.shape[2]
-        x = rearrange(x, 'b c t h w -> (b t) c h w')
-        x = self.resample(x)
-        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
-
-        if self.mode == 'downsample3d':
-            if feat_cache is not None:
-                idx = feat_idx[0]
-                if feat_cache[idx] is None:
-                    feat_cache[idx] = x.clone()
-                    feat_idx[0] += 1
-                else:
-
-                    cache_x = x[:, :, -1:, :, :].clone()
-                    # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
-                    #     # cache last frame of last two chunk
-                    #     cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
-
-                    x = self.time_conv(
-                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
-                    feat_cache[idx] = cache_x
-                    feat_idx[0] += 1
-        return x
-
-    def init_weight(self, conv):
-        conv_weight = conv.weight
-        nn.init.zeros_(conv_weight)
-        c1, c2, t, h, w = conv_weight.size()
-        one_matrix = torch.eye(c1, c2)
-        init_matrix = one_matrix
-        nn.init.zeros_(conv_weight)
-        #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
-        conv_weight.data[:, :, 1, 0, 0] = init_matrix  #* 0.5
-        conv.weight.data.copy_(conv_weight)
-        nn.init.zeros_(conv.bias.data)
-
-    def init_weight2(self, conv):
-        conv_weight = conv.weight.data
-        nn.init.zeros_(conv_weight)
-        c1, c2, t, h, w = conv_weight.size()
-        init_matrix = torch.eye(c1 // 2, c2)
-        #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
-        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
-        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
-        conv.weight.data.copy_(conv_weight)
-        nn.init.zeros_(conv.bias.data)
-
-
-class ResidualBlock(nn.Module):
-
-    def __init__(self, in_dim, out_dim, dropout=0.0):
-        super().__init__()
-        self.in_dim = in_dim
-        self.out_dim = out_dim
-
-        # layers
-        self.residual = nn.Sequential(
-            RMS_norm(in_dim, images=False), nn.SiLU(),
-            CausalConv3d(in_dim, out_dim, 3, padding=1),
-            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
-            CausalConv3d(out_dim, out_dim, 3, padding=1))
-        self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
-            if in_dim != out_dim else nn.Identity()
-
-    def forward(self, x, feat_cache=None, feat_idx=[0]):
-        h = self.shortcut(x)
-        for layer in self.residual:
-            if isinstance(layer, CausalConv3d) and feat_cache is not None:
-                idx = feat_idx[0]
-                cache_x = x[:, :, -CACHE_T:, :, :].clone()
-                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
-                    # cache last frame of last two chunk
-                    cache_x = torch.cat([
-                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                            cache_x.device), cache_x
-                    ],
-                                        dim=2)
-                x = layer(x, feat_cache[idx])
-                feat_cache[idx] = cache_x
-                feat_idx[0] += 1
-            else:
-                x = layer(x)
-        return x + h
-
-
-class AttentionBlock(nn.Module):
-    """
-    Causal self-attention with a single head.
-    """
-
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-
-        # layers
-        self.norm = RMS_norm(dim)
-        self.to_qkv = ops.Conv2d(dim, dim * 3, 1)
-        self.proj = ops.Conv2d(dim, dim, 1)
-        self.optimized_attention = vae_attention()
-
-    def forward(self, x):
-        identity = x
-        b, c, t, h, w = x.size()
-        x = rearrange(x, 'b c t h w -> (b t) c h w')
-        x = self.norm(x)
-        # compute query, key, value
-
-        q, k, v = self.to_qkv(x).chunk(3, dim=1)
-        x = self.optimized_attention(q, k, v)
-
-        # output
-        x = self.proj(x)
-        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
-        return x + identity
-
-
-class Encoder3d(nn.Module):
-
-    def __init__(self,
-                 dim=128,
-                 z_dim=4,
-                 dim_mult=[1, 2, 4, 4],
-                 num_res_blocks=2,
-                 attn_scales=[],
-                 temperal_downsample=[True, True, False],
-                 dropout=0.0):
-        super().__init__()
-        self.dim = dim
-        self.z_dim = z_dim
-        self.dim_mult = dim_mult
-        self.num_res_blocks = num_res_blocks
-        self.attn_scales = attn_scales
-        self.temperal_downsample = temperal_downsample
-
-        # dimensions
-        dims = [dim * u for u in [1] + dim_mult]
-        scale = 1.0
-
-        # init block
-        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
-
-        # downsample blocks
-        downsamples = []
-        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
-            # residual (+attention) blocks
-            for _ in range(num_res_blocks):
-                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
-                if scale in attn_scales:
-                    downsamples.append(AttentionBlock(out_dim))
-                in_dim = out_dim
-
-            # downsample block
-            if i != len(dim_mult) - 1:
-                mode = 'downsample3d' if temperal_downsample[
-                    i] else 'downsample2d'
-                downsamples.append(Resample(out_dim, mode=mode))
-                scale /= 2.0
-        self.downsamples = nn.Sequential(*downsamples)
-
-        # middle blocks
-        self.middle = nn.Sequential(
-            ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
-            ResidualBlock(out_dim, out_dim, dropout))
-
-        # output blocks
-        self.head = nn.Sequential(
-            RMS_norm(out_dim, images=False), nn.SiLU(),
-            CausalConv3d(out_dim, z_dim, 3, padding=1))
-
-    def forward(self, x, feat_cache=None, feat_idx=[0]):
-        if feat_cache is not None:
-            idx = feat_idx[0]
-            cache_x = x[:, :, -CACHE_T:, :, :].clone()
-            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
-                # cache last frame of last two chunk
-                cache_x = torch.cat([
-                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                        cache_x.device), cache_x
-                ],
-                                    dim=2)
-            x = self.conv1(x, feat_cache[idx])
-            feat_cache[idx] = cache_x
-            feat_idx[0] += 1
-        else:
-            x = self.conv1(x)
-
-        ## downsamples
-        for layer in self.downsamples:
-            if feat_cache is not None:
-                x = layer(x, feat_cache, feat_idx)
-            else:
-                x = layer(x)
-
-        ## middle
-        for layer in self.middle:
-            if isinstance(layer, ResidualBlock) and feat_cache is not None:
-                x = layer(x, feat_cache, feat_idx)
-            else:
-                x = layer(x)
-
-        ## head
-        for layer in self.head:
-            if isinstance(layer, CausalConv3d) and feat_cache is not None:
-                idx = feat_idx[0]
-                cache_x = x[:, :, -CACHE_T:, :, :].clone()
-                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
-                    # cache last frame of last two chunk
-                    cache_x = torch.cat([
-                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                            cache_x.device), cache_x
-                    ],
-                                        dim=2)
-                x = layer(x, feat_cache[idx])
-                feat_cache[idx] = cache_x
-                feat_idx[0] += 1
-            else:
-                x = layer(x)
-        return x
-
-
-class Decoder3d(nn.Module):
-
-    def __init__(self,
-                 dim=128,
-                 z_dim=4,
-                 dim_mult=[1, 2, 4, 4],
-                 num_res_blocks=2,
-                 attn_scales=[],
-                 temperal_upsample=[False, True, True],
-                 dropout=0.0):
-        super().__init__()
-        self.dim = dim
-        self.z_dim = z_dim
-        self.dim_mult = dim_mult
-        self.num_res_blocks = num_res_blocks
-        self.attn_scales = attn_scales
-        self.temperal_upsample = temperal_upsample
-
-        # dimensions
-        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
-        scale = 1.0 / 2**(len(dim_mult) - 2)
-
-        # init block
-        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
-
-        # middle blocks
-        self.middle = nn.Sequential(
-            ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
-            ResidualBlock(dims[0], dims[0], dropout))
-
-        # upsample blocks
-        upsamples = []
-        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
-            # residual (+attention) blocks
-            if i == 1 or i == 2 or i == 3:
-                in_dim = in_dim // 2
-            for _ in range(num_res_blocks + 1):
-                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
-                if scale in attn_scales:
-                    upsamples.append(AttentionBlock(out_dim))
-                in_dim = out_dim
-
-            # upsample block
-            if i != len(dim_mult) - 1:
-                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
-                upsamples.append(Resample(out_dim, mode=mode))
-                scale *= 2.0
-        self.upsamples = nn.Sequential(*upsamples)
-
-        # output blocks
-        self.head = nn.Sequential(
-            RMS_norm(out_dim, images=False), nn.SiLU(),
-            CausalConv3d(out_dim, 3, 3, padding=1))
-
-    def forward(self, x, feat_cache=None, feat_idx=[0]):
-        ## conv1
-        if feat_cache is not None:
-            idx = feat_idx[0]
-            cache_x = x[:, :, -CACHE_T:, :, :].clone()
-            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
-                # cache last frame of last two chunk
-                cache_x = torch.cat([
-                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                        cache_x.device), cache_x
-                ],
-                                    dim=2)
-            x = self.conv1(x, feat_cache[idx])
-            feat_cache[idx] = cache_x
-            feat_idx[0] += 1
-        else:
-            x = self.conv1(x)
-
-        ## middle
-        for layer in self.middle:
-            if isinstance(layer, ResidualBlock) and feat_cache is not None:
-                x = layer(x, feat_cache, feat_idx)
-            else:
-                x = layer(x)
-
-        ## upsamples
-        for layer in self.upsamples:
-            if feat_cache is not None:
-                x = layer(x, feat_cache, feat_idx)
-            else:
-                x = layer(x)
-
-        ## head
-        for layer in self.head:
-            if isinstance(layer, CausalConv3d) and feat_cache is not None:
-                idx = feat_idx[0]
-                cache_x = x[:, :, -CACHE_T:, :, :].clone()
-                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
-                    # cache last frame of last two chunk
-                    cache_x = torch.cat([
-                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
-                            cache_x.device), cache_x
-                    ],
-                                        dim=2)
-                x = layer(x, feat_cache[idx])
-                feat_cache[idx] = cache_x
-                feat_idx[0] += 1
-            else:
-                x = layer(x)
-        return x
-
-
-def count_conv3d(model):
-    count = 0
-    for m in model.modules():
-        if isinstance(m, CausalConv3d):
-            count += 1
-    return count
-
-
-class WanVAE(nn.Module):
-
-    def __init__(self,
-                 dim=128,
-                 z_dim=4,
-                 dim_mult=[1, 2, 4, 4],
-                 num_res_blocks=2,
-                 attn_scales=[],
-                 temperal_downsample=[True, True, False],
-                 dropout=0.0):
-        super().__init__()
-        self.dim = dim
-        self.z_dim = z_dim
-        self.dim_mult = dim_mult
-        self.num_res_blocks = num_res_blocks
-        self.attn_scales = attn_scales
-        self.temperal_downsample = temperal_downsample
-        self.temperal_upsample = temperal_downsample[::-1]
-
-        # modules
-        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
-                                 attn_scales, self.temperal_downsample, dropout)
-        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
-        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
-        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
-                                 attn_scales, self.temperal_upsample, dropout)
-
-    def forward(self, x):
-        mu, log_var = self.encode(x)
-        z = self.reparameterize(mu, log_var)
-        x_recon = self.decode(z)
-        return x_recon, mu, log_var
-
-    def encode(self, x):
-        self.clear_cache()
-        ## cache
-        t = x.shape[2]
-        iter_ = 1 + (t - 1) // 4
-        ## 对encode输入的x，按时间拆分为1、4、4、4....
-        for i in range(iter_):
-            self._enc_conv_idx = [0]
-            if i == 0:
-                out = self.encoder(
-                    x[:, :, :1, :, :],
-                    feat_cache=self._enc_feat_map,
-                    feat_idx=self._enc_conv_idx)
-            else:
-                out_ = self.encoder(
-                    x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
-                    feat_cache=self._enc_feat_map,
-                    feat_idx=self._enc_conv_idx)
-                out = torch.cat([out, out_], 2)
-        mu, log_var = self.conv1(out).chunk(2, dim=1)
-        self.clear_cache()
-        return mu
-
-    def decode(self, z):
-        self.clear_cache()
-        # z: [b,c,t,h,w]
-
-        iter_ = z.shape[2]
-        x = self.conv2(z)
-        for i in range(iter_):
-            self._conv_idx = [0]
-            if i == 0:
-                out = self.decoder(
-                    x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
-                    feat_idx=self._conv_idx)
-            else:
-                out_ = self.decoder(
-                    x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
-                    feat_idx=self._conv_idx)
-                out = torch.cat([out, out_], 2)
-        self.clear_cache()
-        return out
-
-    def reparameterize(self, mu, log_var):
-        std = torch.exp(0.5 * log_var)
-        eps = torch.randn_like(std)
-        return eps * std + mu
-
-    def sample(self, imgs, deterministic=False):
-        mu, log_var = self.encode(imgs)
-        if deterministic:
-            return mu
-        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
-        return mu + std * torch.randn_like(std)
-
-    def clear_cache(self):
-        self._conv_num = count_conv3d(self.decoder)
-        self._conv_idx = [0]
-        self._feat_map = [None] * self._conv_num
-        #cache encode
-        self._enc_conv_num = count_conv3d(self.encoder)
-        self._enc_conv_idx = [0]
-        self._enc_feat_map = [None] * self._enc_conv_num
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -20,7 +20,6 @@ from __future__ import annotations
 import comfy.utils
 import comfy.model_management
 import comfy.model_base
-import comfy.weight_adapter as weight_adapter
 import logging
 import torch

@@ -50,12 +49,139 @@ def load_lora(lora, to_load, log_missing=True):
            dora_scale = lora[dora_scale_name]
            loaded_keys.add(dora_scale_name)

-        for adapter_cls in weight_adapter.adapters:
-            adapter = adapter_cls.load(x, lora, alpha, dora_scale, loaded_keys)
-            if adapter is not None:
-                patch_dict[to_load[x]] = adapter
-                loaded_keys.update(adapter.loaded_keys)
-                continue
+        reshape_name = "{}.reshape_weight".format(x)
+        reshape = None
+        if reshape_name in lora.keys():
+            try:
+                reshape = lora[reshape_name].tolist()
+                loaded_keys.add(reshape_name)
+            except:
+                pass
+
+        regular_lora = "{}.lora_up.weight".format(x)
+        diffusers_lora = "{}_lora.up.weight".format(x)
+        diffusers2_lora = "{}.lora_B.weight".format(x)
+        diffusers3_lora = "{}.lora.up.weight".format(x)
+        mochi_lora = "{}.lora_B".format(x)
+        transformers_lora = "{}.lora_linear_layer.up.weight".format(x)
+        A_name = None
+
+        if regular_lora in lora.keys():
+            A_name = regular_lora
+            B_name = "{}.lora_down.weight".format(x)
+            mid_name = "{}.lora_mid.weight".format(x)
+        elif diffusers_lora in lora.keys():
+            A_name = diffusers_lora
+            B_name = "{}_lora.down.weight".format(x)
+            mid_name = None
+        elif diffusers2_lora in lora.keys():
+            A_name = diffusers2_lora
+            B_name = "{}.lora_A.weight".format(x)
+            mid_name = None
+        elif diffusers3_lora in lora.keys():
+            A_name = diffusers3_lora
+            B_name = "{}.lora.down.weight".format(x)
+            mid_name = None
+        elif mochi_lora in lora.keys():
+            A_name = mochi_lora
+            B_name = "{}.lora_A".format(x)
+            mid_name = None
+        elif transformers_lora in lora.keys():
+            A_name = transformers_lora
+            B_name ="{}.lora_linear_layer.down.weight".format(x)
+            mid_name = None
+
+        if A_name is not None:
+            mid = None
+            if mid_name is not None and mid_name in lora.keys():
+                mid = lora[mid_name]
+                loaded_keys.add(mid_name)
+            patch_dict[to_load[x]] = ("lora", (lora[A_name], lora[B_name], alpha, mid, dora_scale, reshape))
+            loaded_keys.add(A_name)
+            loaded_keys.add(B_name)
+
+
+        ######## loha
+        hada_w1_a_name = "{}.hada_w1_a".format(x)
+        hada_w1_b_name = "{}.hada_w1_b".format(x)
+        hada_w2_a_name = "{}.hada_w2_a".format(x)
+        hada_w2_b_name = "{}.hada_w2_b".format(x)
+        hada_t1_name = "{}.hada_t1".format(x)
+        hada_t2_name = "{}.hada_t2".format(x)
+        if hada_w1_a_name in lora.keys():
+            hada_t1 = None
+            hada_t2 = None
+            if hada_t1_name in lora.keys():
+                hada_t1 = lora[hada_t1_name]
+                hada_t2 = lora[hada_t2_name]
+                loaded_keys.add(hada_t1_name)
+                loaded_keys.add(hada_t2_name)
+
+            patch_dict[to_load[x]] = ("loha", (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha, lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2, dora_scale))
+            loaded_keys.add(hada_w1_a_name)
+            loaded_keys.add(hada_w1_b_name)
+            loaded_keys.add(hada_w2_a_name)
+            loaded_keys.add(hada_w2_b_name)
+
+
+        ######## lokr
+        lokr_w1_name = "{}.lokr_w1".format(x)
+        lokr_w2_name = "{}.lokr_w2".format(x)
+        lokr_w1_a_name = "{}.lokr_w1_a".format(x)
+        lokr_w1_b_name = "{}.lokr_w1_b".format(x)
+        lokr_t2_name = "{}.lokr_t2".format(x)
+        lokr_w2_a_name = "{}.lokr_w2_a".format(x)
+        lokr_w2_b_name = "{}.lokr_w2_b".format(x)
+
+        lokr_w1 = None
+        if lokr_w1_name in lora.keys():
+            lokr_w1 = lora[lokr_w1_name]
+            loaded_keys.add(lokr_w1_name)
+
+        lokr_w2 = None
+        if lokr_w2_name in lora.keys():
+            lokr_w2 = lora[lokr_w2_name]
+            loaded_keys.add(lokr_w2_name)
+
+        lokr_w1_a = None
+        if lokr_w1_a_name in lora.keys():
+            lokr_w1_a = lora[lokr_w1_a_name]
+            loaded_keys.add(lokr_w1_a_name)
+
+        lokr_w1_b = None
+        if lokr_w1_b_name in lora.keys():
+            lokr_w1_b = lora[lokr_w1_b_name]
+            loaded_keys.add(lokr_w1_b_name)
+
+        lokr_w2_a = None
+        if lokr_w2_a_name in lora.keys():
+            lokr_w2_a = lora[lokr_w2_a_name]
+            loaded_keys.add(lokr_w2_a_name)
+
+        lokr_w2_b = None
+        if lokr_w2_b_name in lora.keys():
+            lokr_w2_b = lora[lokr_w2_b_name]
+            loaded_keys.add(lokr_w2_b_name)
+
+        lokr_t2 = None
+        if lokr_t2_name in lora.keys():
+            lokr_t2 = lora[lokr_t2_name]
+            loaded_keys.add(lokr_t2_name)
+
+        if (lokr_w1 is not None) or (lokr_w2 is not None) or (lokr_w1_a is not None) or (lokr_w2_a is not None):
+            patch_dict[to_load[x]] = ("lokr", (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale))
+
+        #glora
+        a1_name = "{}.a1.weight".format(x)
+        a2_name = "{}.a2.weight".format(x)
+        b1_name = "{}.b1.weight".format(x)
+        b2_name = "{}.b2.weight".format(x)
+        if a1_name in lora:
+            patch_dict[to_load[x]] = ("glora", (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha, dora_scale))
+            loaded_keys.add(a1_name)
+            loaded_keys.add(a2_name)
+            loaded_keys.add(b1_name)
+            loaded_keys.add(b2_name)

        w_norm_name = "{}.w_norm".format(x)
        b_norm_name = "{}.b_norm".format(x)
@@ -181,6 +307,7 @@ def model_lora_keys_unet(model, key_map={}):
            if k.endswith(".weight"):
                key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
                key_map["lora_unet_{}".format(key_lora)] = k
+                key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
                key_map["{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
            else:
                key_map["{}".format(k)] = k #generic lora format for not .weight without any weird key names
@@ -200,13 +327,6 @@ def model_lora_keys_unet(model, key_map={}):
                    diffusers_lora_key = diffusers_lora_key[:-2]
                key_map[diffusers_lora_key] = unet_key

-    if isinstance(model, comfy.model_base.StableCascade_C):
-        for k in sdk:
-            if k.startswith("diffusion_model."):
-                if k.endswith(".weight"):
-                    key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
-                    key_map["lora_prior_unet_{}".format(key_lora)] = k
-
    if isinstance(model, comfy.model_base.SD3): #Diffusers lora SD3
        diffusers_keys = comfy.utils.mmdit_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
        for k in diffusers_keys:
@@ -279,16 +399,29 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["transformer.{}".format(key_lora)] = k
                key_map["diffusion_model.{}".format(key_lora)] = k  # Old loras

-    if isinstance(model, comfy.model_base.HiDream):
-        for k in sdk:
-            if k.startswith("diffusion_model."):
-                if k.endswith(".weight"):
-                    key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
-                    key_map["lycoris_{}".format(key_lora)] = k #SimpleTuner lycoris format
-
    return key_map


+def weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function):
+    dora_scale = comfy.model_management.cast_to_device(dora_scale, weight.device, intermediate_dtype)
+    lora_diff *= alpha
+    weight_calc = weight + function(lora_diff).type(weight.dtype)
+    weight_norm = (
+        weight_calc.transpose(0, 1)
+        .reshape(weight_calc.shape[1], -1)
+        .norm(dim=1, keepdim=True)
+        .reshape(weight_calc.shape[1], *[1] * (weight_calc.dim() - 1))
+        .transpose(0, 1)
+    )
+
+    weight_calc *= (dora_scale / weight_norm).type(weight.dtype)
+    if strength != 1.0:
+        weight_calc -= weight
+        weight += strength * (weight_calc)
+    else:
+        weight[:] = weight_calc
+    return weight
+
 def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Tensor:
    """
    Pad a tensor to a new shape with zeros.
@@ -343,16 +476,6 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
        if isinstance(v, list):
            v = (calculate_weight(v[1:], v[0][1](comfy.model_management.cast_to_device(v[0][0], weight.device, intermediate_dtype, copy=True), inplace=True), key, intermediate_dtype=intermediate_dtype), )

-        if isinstance(v, weight_adapter.WeightAdapterBase):
-            output = v.calculate_weight(weight, key, strength, strength_model, offset, function, intermediate_dtype, original_weights)
-            if output is None:
-                logging.warning("Calculate Weight Failed: {} {}".format(v.name, key))
-            else:
-                weight = output
-                if old_weight is not None:
-                    weight = old_weight
-            continue
-
        if len(v) == 1:
            patch_type = "diff"
        elif len(v) == 2:
@@ -379,6 +502,157 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
            diff_weight = comfy.model_management.cast_to_device(target_weight, weight.device, intermediate_dtype) - \
                          comfy.model_management.cast_to_device(original_weights[key][0][0], weight.device, intermediate_dtype)
            weight += function(strength * comfy.model_management.cast_to_device(diff_weight, weight.device, weight.dtype))
+        elif patch_type == "lora": #lora/locon
+            mat1 = comfy.model_management.cast_to_device(v[0], weight.device, intermediate_dtype)
+            mat2 = comfy.model_management.cast_to_device(v[1], weight.device, intermediate_dtype)
+            dora_scale = v[4]
+            reshape = v[5]
+
+            if reshape is not None:
+                weight = pad_tensor_to_shape(weight, reshape)
+
+            if v[2] is not None:
+                alpha = v[2] / mat2.shape[0]
+            else:
+                alpha = 1.0
+
+            if v[3] is not None:
+                #locon mid weights, hopefully the math is fine because I didn't properly test it
+                mat3 = comfy.model_management.cast_to_device(v[3], weight.device, intermediate_dtype)
+                final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]]
+                mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1)
+            try:
+                lora_diff = torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "lokr":
+            w1 = v[0]
+            w2 = v[1]
+            w1_a = v[3]
+            w1_b = v[4]
+            w2_a = v[5]
+            w2_b = v[6]
+            t2 = v[7]
+            dora_scale = v[8]
+            dim = None
+
+            if w1 is None:
+                dim = w1_b.shape[0]
+                w1 = torch.mm(comfy.model_management.cast_to_device(w1_a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w1_b, weight.device, intermediate_dtype))
+            else:
+                w1 = comfy.model_management.cast_to_device(w1, weight.device, intermediate_dtype)
+
+            if w2 is None:
+                dim = w2_b.shape[0]
+                if t2 is None:
+                    w2 = torch.mm(comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype))
+                else:
+                    w2 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                        comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
+                                        comfy.model_management.cast_to_device(w2_b, weight.device, intermediate_dtype),
+                                        comfy.model_management.cast_to_device(w2_a, weight.device, intermediate_dtype))
+            else:
+                w2 = comfy.model_management.cast_to_device(w2, weight.device, intermediate_dtype)
+
+            if len(w2.shape) == 4:
+                w1 = w1.unsqueeze(2).unsqueeze(2)
+            if v[2] is not None and dim is not None:
+                alpha = v[2] / dim
+            else:
+                alpha = 1.0
+
+            try:
+                lora_diff = torch.kron(w1, w2).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "loha":
+            w1a = v[0]
+            w1b = v[1]
+            if v[2] is not None:
+                alpha = v[2] / w1b.shape[0]
+            else:
+                alpha = 1.0
+
+            w2a = v[3]
+            w2b = v[4]
+            dora_scale = v[7]
+            if v[5] is not None: #cp decomposition
+                t1 = v[5]
+                t2 = v[6]
+                m1 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                    comfy.model_management.cast_to_device(t1, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype))
+
+                m2 = torch.einsum('i j k l, j r, i p -> p r k l',
+                                    comfy.model_management.cast_to_device(t2, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype),
+                                    comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype))
+            else:
+                m1 = torch.mm(comfy.model_management.cast_to_device(w1a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w1b, weight.device, intermediate_dtype))
+                m2 = torch.mm(comfy.model_management.cast_to_device(w2a, weight.device, intermediate_dtype),
+                                comfy.model_management.cast_to_device(w2b, weight.device, intermediate_dtype))
+
+            try:
+                lora_diff = (m1 * m2).reshape(weight.shape)
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
+        elif patch_type == "glora":
+            dora_scale = v[5]
+
+            old_glora = False
+            if v[3].shape[1] == v[2].shape[0] == v[0].shape[0] == v[1].shape[1]:
+                rank = v[0].shape[0]
+                old_glora = True
+
+            if v[3].shape[0] == v[2].shape[1] == v[0].shape[1] == v[1].shape[0]:
+                if old_glora and v[1].shape[0] == weight.shape[0] and weight.shape[0] == weight.shape[1]:
+                    pass
+                else:
+                    old_glora = False
+                    rank = v[1].shape[0]
+
+            a1 = comfy.model_management.cast_to_device(v[0].flatten(start_dim=1), weight.device, intermediate_dtype)
+            a2 = comfy.model_management.cast_to_device(v[1].flatten(start_dim=1), weight.device, intermediate_dtype)
+            b1 = comfy.model_management.cast_to_device(v[2].flatten(start_dim=1), weight.device, intermediate_dtype)
+            b2 = comfy.model_management.cast_to_device(v[3].flatten(start_dim=1), weight.device, intermediate_dtype)
+
+            if v[4] is not None:
+                alpha = v[4] / rank
+            else:
+                alpha = 1.0
+
+            try:
+                if old_glora:
+                    lora_diff = (torch.mm(b2, b1) + torch.mm(torch.mm(weight.flatten(start_dim=1).to(dtype=intermediate_dtype), a2), a1)).reshape(weight.shape) #old lycoris glora
+                else:
+                    if weight.dim() > 2:
+                        lora_diff = torch.einsum("o i ..., i j -> o j ...", torch.einsum("o i ..., i j -> o j ...", weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    else:
+                        lora_diff = torch.mm(torch.mm(weight.to(dtype=intermediate_dtype), a1), a2).reshape(weight.shape)
+                    lora_diff += torch.mm(b1, b2).reshape(weight.shape)
+
+                if dora_scale is not None:
+                    weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function)
+                else:
+                    weight += function(((strength * alpha) * lora_diff).type(weight.dtype))
+            except Exception as e:
+                logging.error("ERROR {} {} {}".format(patch_type, key, e))
        else:
            logging.warning("patch type not recognized {} {}".format(patch_type, key))

--- a/comfy/lora_convert.py
+++ b/comfy/lora_convert.py
@@ -1,5 +1,4 @@
 import torch
-import comfy.utils


 def convert_lora_bfl_control(sd): #BFL loras for Flux
@@ -12,13 +11,7 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
    return sd_out


-def convert_lora_wan_fun(sd): #Wan Fun loras
-    return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
-
-
 def convert_lora(sd):
    if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
        return convert_lora_bfl_control(sd)
-    if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
-        return convert_lora_wan_fun(sd)
    return sd
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -34,12 +34,6 @@ import comfy.ldm.flux.model
 import comfy.ldm.lightricks.model
 import comfy.ldm.hunyuan_video.model
 import comfy.ldm.cosmos.model
-import comfy.ldm.lumina.model
-import comfy.ldm.wan.model
-import comfy.ldm.hunyuan3d.model
-import comfy.ldm.hidream.model
-import comfy.ldm.chroma.model
-import comfy.ldm.ace.model

 import comfy.model_management
 import comfy.patcher_extension
@@ -62,7 +56,6 @@ class ModelType(Enum):
    FLOW = 6
    V_PREDICTION_CONTINUOUS = 7
    FLUX = 8
-    IMG_TO_IMG = 9


 from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@@ -93,8 +86,6 @@ def model_sampling(model_config, model_type):
    elif model_type == ModelType.FLUX:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingFlux
-    elif model_type == ModelType.IMG_TO_IMG:
-        c = comfy.model_sampling.IMG_TO_IMG

    class ModelSampling(s, c):
        pass
@@ -115,7 +106,7 @@ class BaseModel(torch.nn.Module):

        if not unet_config.get("disable_unet_model_creation", False):
            if model_config.custom_operations is None:
-                fp8 = model_config.optimizations.get("fp8", False)
+                fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
            else:
                operations = model_config.custom_operations
@@ -146,7 +137,6 @@ class BaseModel(torch.nn.Module):
    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
-
        if c_concat is not None:
            xc = torch.cat([xc] + [c_concat], dim=1)

@@ -158,9 +148,7 @@ class BaseModel(torch.nn.Module):

        xc = xc.to(dtype)
        t = self.model_sampling.timestep(t).float()
-        if context is not None:
-            context = context.to(dtype)
-
+        context = context.to(dtype)
        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
@@ -169,16 +157,15 @@ class BaseModel(torch.nn.Module):
                    extra = extra.to(dtype)
            extra_conds[o] = extra

-        t = self.process_timestep(t, x=x, **extra_conds)
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)

-    def process_timestep(self, timestep, **kwargs):
-        return timestep
-
    def get_dtype(self):
        return self.diffusion_model.dtype

+    def is_adm(self):
+        return self.adm_channels > 0  # NOTE(review): adm_channels is not assigned in this method - confirm it is always set
+
    def encode_adm(self, **kwargs):
        return None

@@ -197,11 +184,6 @@ class BaseModel(torch.nn.Module):

            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
-                if noise.ndim == 5:
-                    if concat_latent_image.shape[-3] < noise.shape[-3]:
-                        concat_latent_image = torch.nn.functional.pad(concat_latent_image, (0, 0, 0, 0, 0, noise.shape[-3] - concat_latent_image.shape[-3]), "constant", 0)
-                    else:
-                        concat_latent_image = concat_latent_image[:, :, :noise.shape[-3]]

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

@@ -230,11 +212,6 @@ class BaseModel(torch.nn.Module):
                        cond_concat.append(self.blank_inpaint_image_like(noise))
                    elif ck == "mask_inverted":
                        cond_concat.append(torch.zeros_like(noise)[:, :1])
-                if ck == "concat_image":
-                    if concat_latent_image is not None:
-                        cond_concat.append(concat_latent_image.to(device))
-                    else:
-                        cond_concat.append(torch.zeros_like(noise))
            data = torch.cat(cond_concat, dim=1)
            return data
        return None
@@ -572,10 +549,6 @@ class SD_X4Upscaler(BaseModel):

        out['c_concat'] = comfy.conds.CONDNoiseShape(image)
        out['y'] = comfy.conds.CONDRegular(noise_level)
-
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out

 class IP2P:
@@ -608,19 +581,6 @@ class SDXL_instructpix2pix(IP2P, SDXL):
        else:
            self.process_ip2p_image_in = lambda image: image #diffusers ip2p

-class Lotus(BaseModel):
-    def extra_conds(self, **kwargs):
-        out = {}
-        cross_attn = kwargs.get("cross_attn", None)
-        out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
-        device = kwargs["device"]
-        task_emb = torch.tensor([1, 0]).float().to(device)
-        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)]).unsqueeze(0)
-        out['y'] = comfy.conds.CONDRegular(task_emb)
-        return out
-
-    def __init__(self, model_config, model_type=ModelType.IMG_TO_IMG, device=None):
-        super().__init__(model_config, model_type, device=device)

 class StableCascade_C(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
@@ -788,8 +748,8 @@ class PixArt(BaseModel):
        return out

 class Flux(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.flux.model.Flux):
-        super().__init__(model_config, model_type, device=device, unet_model=unet_model)
+    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)

    def concat_cond(self, **kwargs):
        try:
@@ -846,10 +806,7 @@ class Flux(BaseModel):
            (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
            attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
-
-        guidance = kwargs.get("guidance", 3.5)
-        if guidance is not None:
-            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
+        out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 3.5)]))
        return out

 class GenmoMochi(BaseModel):
@@ -880,26 +837,17 @@ class LTXV(BaseModel):
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

+        guiding_latent = kwargs.get("guiding_latent", None)
+        if guiding_latent is not None:
+            out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
+
+        guiding_latent_noise_scale = kwargs.get("guiding_latent_noise_scale", None)
+        if guiding_latent_noise_scale is not None:
+            out["guiding_latent_noise_scale"] = comfy.conds.CONDConstant(guiding_latent_noise_scale)
+
        out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
-
-        denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
-        if denoise_mask is not None:
-            out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
-
-        keyframe_idxs = kwargs.get("keyframe_idxs", None)
-        if keyframe_idxs is not None:
-            out['keyframe_idxs'] = comfy.conds.CONDRegular(keyframe_idxs)
-
        return out

-    def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
-        if denoise_mask is None:
-            return timestep
-        return self.diffusion_model.patchifier.patchify(((denoise_mask) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1)))[:, :1])[0]
-
-    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
-        return latent_image
-
 class HunyuanVideo(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
@@ -915,36 +863,9 @@ class HunyuanVideo(BaseModel):
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-
-        guidance = kwargs.get("guidance", 6.0)
-        if guidance is not None:
-            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
-
-        guiding_frame_index = kwargs.get("guiding_frame_index", None)
-        if guiding_frame_index is not None:
-            out['guiding_frame_index'] = comfy.conds.CONDRegular(torch.FloatTensor([guiding_frame_index]))
-
+        out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 6.0)]))
        return out

-    def scale_latent_inpaint(self, latent_image, **kwargs):
-        return latent_image
-
-class HunyuanVideoI2V(HunyuanVideo):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device)
-        self.concat_keys = ("concat_image", "mask_inverted")
-
-    def scale_latent_inpaint(self, latent_image, **kwargs):
-        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
-
-class HunyuanVideoSkyreelsI2V(HunyuanVideo):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device)
-        self.concat_keys = ("concat_image",)
-
-    def scale_latent_inpaint(self, latent_image, **kwargs):
-        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
-
 class CosmosVideo(BaseModel):
    def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT)
@@ -971,173 +892,3 @@ class CosmosVideo(BaseModel):
            latent_image = latent_image + noise
        latent_image = self.model_sampling.calculate_input(torch.tensor([sigma_noise_augmentation], device=latent_image.device, dtype=latent_image.dtype), latent_image)
        return latent_image * ((sigma ** 2 + self.model_sampling.sigma_data ** 2) ** 0.5)
-
-class Lumina2(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        attention_mask = kwargs.get("attention_mask", None)
-        if attention_mask is not None:
-            if torch.numel(attention_mask) != attention_mask.sum():
-                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
-            out['num_tokens'] = comfy.conds.CONDConstant(max(1, torch.sum(attention_mask).item()))
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-        return out
-
-class WAN21(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
-        self.image_to_video = image_to_video
-
-    def concat_cond(self, **kwargs):
-        noise = kwargs.get("noise", None)
-        extra_channels = self.diffusion_model.patch_embedding.weight.shape[1] - noise.shape[1]
-        if extra_channels == 0:
-            return None
-
-        image = kwargs.get("concat_latent_image", None)
-        device = kwargs["device"]
-
-        if image is None:
-            shape_image = list(noise.shape)
-            shape_image[1] = extra_channels
-            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
-        else:
-            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
-            for i in range(0, image.shape[1], 16):
-                image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
-            image = utils.resize_to_batch_size(image, noise.shape[0])
-
-        if not self.image_to_video or extra_channels == image.shape[1]:
-            return image
-
-        if image.shape[1] > (extra_channels - 4):
-            image = image[:, :(extra_channels - 4)]
-
-        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
-        if mask is None:
-            mask = torch.zeros_like(noise)[:, :4]
-        else:
-            if mask.shape[1] != 4:
-                mask = torch.mean(mask, dim=1, keepdim=True)
-            mask = 1.0 - mask
-            mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
-            if mask.shape[-3] < noise.shape[-3]:
-                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
-            if mask.shape[1] == 1:
-                mask = mask.repeat(1, 4, 1, 1, 1)
-            mask = utils.resize_to_batch_size(mask, noise.shape[0])
-
-        return torch.cat((mask, image), dim=1)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-
-        clip_vision_output = kwargs.get("clip_vision_output", None)
-        if clip_vision_output is not None:
-            out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states)
-        return out
-
-
-class WAN21_Vace(WAN21):
-    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
-        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.VaceWanModel)
-        self.image_to_video = image_to_video
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        noise = kwargs.get("noise", None)
-        noise_shape = list(noise.shape)
-        vace_frames = kwargs.get("vace_frames", None)
-        if vace_frames is None:
-            noise_shape[1] = 32
-            vace_frames = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
-
-        for i in range(0, vace_frames.shape[1], 16):
-            vace_frames = vace_frames.clone()
-            vace_frames[:, i:i + 16] = self.process_latent_in(vace_frames[:, i:i + 16])
-
-        mask = kwargs.get("vace_mask", None)
-        if mask is None:
-            noise_shape[1] = 64
-            mask = torch.ones(noise_shape, device=noise.device, dtype=noise.dtype)
-
-        out['vace_context'] = comfy.conds.CONDRegular(torch.cat([vace_frames.to(noise), mask.to(noise)], dim=1))
-
-        vace_strength = kwargs.get("vace_strength", 1.0)
-        out['vace_strength'] = comfy.conds.CONDConstant(vace_strength)
-        return out
-
-
-class Hunyuan3Dv2(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-
-        guidance = kwargs.get("guidance", 5.0)
-        if guidance is not None:
-            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
-        return out
-
-class HiDream(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream.model.HiDreamImageTransformer2DModel)
-
-    def encode_adm(self, **kwargs):
-        return kwargs["pooled_output"]
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-        conditioning_llama3 = kwargs.get("conditioning_llama3", None)
-        if conditioning_llama3 is not None:
-            out['encoder_hidden_states_llama3'] = comfy.conds.CONDRegular(conditioning_llama3)
-        image_cond = kwargs.get("concat_latent_image", None)
-        if image_cond is not None:
-            out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
-        return out
-
-class Chroma(Flux):
-    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-
-        guidance = kwargs.get("guidance", 0)
-        if guidance is not None:
-            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
-        return out
-
-class ACEStep(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.model.ACEStepTransformer2DModel)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        noise = kwargs.get("noise", None)
-
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-
-        conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
-        if cross_attn is not None:
-            out['lyric_token_idx'] = comfy.conds.CONDRegular(conditioning_lyrics)
-        out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
-        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
-        return out
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -1,4 +1,3 @@
-import json
 import comfy.supported_models
 import comfy.supported_models_base
 import comfy.utils
@@ -34,7 +33,7 @@ def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
        return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack, time_stack_cross
    return None

-def detect_unet_config(state_dict, key_prefix, metadata=None):
+def detect_unet_config(state_dict, key_prefix):
    state_dict_keys = list(state_dict.keys())

    if '{}joint_blocks.0.context_block.attn.qkv.weight'.format(key_prefix) in state_dict_keys: #mmdit model
@@ -137,7 +136,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
    if '{}txt_in.individual_token_refiner.blocks.0.norm1.weight'.format(key_prefix) in state_dict_keys: #Hunyuan Video
        dit_config = {}
        dit_config["image_model"] = "hunyuan_video"
-        dit_config["in_channels"] = state_dict['{}img_in.proj.weight'.format(key_prefix)].shape[1] #SkyReels img2video has 32 input channels
+        dit_config["in_channels"] = 16
        dit_config["patch_size"] = [1, 2, 2]
        dit_config["out_channels"] = 16
        dit_config["vec_in_dim"] = 768
@@ -154,7 +153,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["guidance_embed"] = len(guidance_keys) > 0
        return dit_config

-    if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux
+    if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys: #Flux
        dit_config = {}
        dit_config["image_model"] = "flux"
        dit_config["in_channels"] = 16
@@ -164,9 +163,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        if in_key in state_dict_keys:
            dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
        dit_config["out_channels"] = 16
-        vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix)
-        if vec_in_key in state_dict_keys:
-            dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
+        dit_config["vec_in_dim"] = 768
        dit_config["context_in_dim"] = 4096
        dit_config["hidden_size"] = 3072
        dit_config["mlp_ratio"] = 4.0
@@ -176,16 +173,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["axes_dim"] = [16, 56, 56]
        dit_config["theta"] = 10000
        dit_config["qkv_bias"] = True
-        if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma
-            dit_config["image_model"] = "chroma"
-            dit_config["in_channels"] = 64
-            dit_config["out_channels"] = 64
-            dit_config["in_dim"] = 64
-            dit_config["out_dim"] = 3072
-            dit_config["hidden_dim"] = 5120
-            dit_config["n_layers"] = 5
-        else:
-            dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
+        dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
        return dit_config

    if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview
@@ -222,37 +210,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
    if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
        dit_config = {}
        dit_config["image_model"] = "ltxv"
-        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
-        shape = state_dict['{}transformer_blocks.0.attn2.to_k.weight'.format(key_prefix)].shape
-        dit_config["attention_head_dim"] = shape[0] // 32
-        dit_config["cross_attention_dim"] = shape[1]
-        if metadata is not None and "config" in metadata:
-            dit_config.update(json.loads(metadata["config"]).get("transformer", {}))
-        return dit_config
-
-    if '{}genre_embedder.weight'.format(key_prefix) in state_dict_keys: #ACE-Step model
-        dit_config = {}
-        dit_config["audio_model"] = "ace"
-        dit_config["attention_head_dim"] = 128
-        dit_config["in_channels"] = 8
-        dit_config["inner_dim"] = 2560
-        dit_config["max_height"] = 16
-        dit_config["max_position"] = 32768
-        dit_config["max_width"] = 32768
-        dit_config["mlp_ratio"] = 2.5
-        dit_config["num_attention_heads"] = 20
-        dit_config["num_layers"] = 24
-        dit_config["out_channels"] = 8
-        dit_config["patch_size"] = [16, 1]
-        dit_config["rope_theta"] = 1000000.0
-        dit_config["speaker_embedding_dim"] = 512
-        dit_config["text_embedding_dim"] = 768
-
-        dit_config["ssl_encoder_depths"] = [8, 8]
-        dit_config["ssl_latent_dims"] = [1024, 768]
-        dit_config["ssl_names"] = ["mert", "m-hubert"]
-        dit_config["lyric_encoder_vocab_size"] = 6693
-        dit_config["lyric_hidden_size"] = 1024
        return dit_config

    if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt
@@ -282,7 +239,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["micro_condition"] = False
        return dit_config

-    if '{}blocks.block0.blocks.0.block.attn.to_q.0.weight'.format(key_prefix) in state_dict_keys:  # Cosmos
+    if '{}blocks.block0.blocks.0.block.attn.to_q.0.weight'.format(key_prefix) in state_dict_keys:
        dit_config = {}
        dit_config["image_model"] = "cosmos"
        dit_config["max_img_h"] = 240
@@ -327,84 +284,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["extra_per_block_abs_pos_emb_type"] = "learnable"
        return dit_config

-    if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys:  # Lumina 2
-        dit_config = {}
-        dit_config["image_model"] = "lumina2"
-        dit_config["patch_size"] = 2
-        dit_config["in_channels"] = 16
-        dit_config["dim"] = 2304
-        dit_config["cap_feat_dim"] = 2304
-        dit_config["n_layers"] = 26
-        dit_config["n_heads"] = 24
-        dit_config["n_kv_heads"] = 8
-        dit_config["qk_norm"] = True
-        dit_config["axes_dims"] = [32, 32, 32]
-        dit_config["axes_lens"] = [300, 512, 512]
-        return dit_config
-
-    if '{}head.modulation'.format(key_prefix) in state_dict_keys:  # Wan 2.1
-        dit_config = {}
-        dit_config["image_model"] = "wan2.1"
-        dim = state_dict['{}head.modulation'.format(key_prefix)].shape[-1]
-        dit_config["dim"] = dim
-        dit_config["num_heads"] = dim // 128
-        dit_config["ffn_dim"] = state_dict['{}blocks.0.ffn.0.weight'.format(key_prefix)].shape[0]
-        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
-        dit_config["patch_size"] = (1, 2, 2)
-        dit_config["freq_dim"] = 256
-        dit_config["window_size"] = (-1, -1)
-        dit_config["qk_norm"] = True
-        dit_config["cross_attn_norm"] = True
-        dit_config["eps"] = 1e-6
-        dit_config["in_dim"] = state_dict['{}patch_embedding.weight'.format(key_prefix)].shape[1]
-        if '{}vace_patch_embedding.weight'.format(key_prefix) in state_dict_keys:
-            dit_config["model_type"] = "vace"
-            dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1]
-            dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.')
-        else:
-            if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
-                dit_config["model_type"] = "i2v"
-            else:
-                dit_config["model_type"] = "t2v"
-        flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix))
-        if flf_weight is not None:
-            dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1]
-        return dit_config
-
-    if '{}latent_in.weight'.format(key_prefix) in state_dict_keys:  # Hunyuan 3D
-        in_shape = state_dict['{}latent_in.weight'.format(key_prefix)].shape
-        dit_config = {}
-        dit_config["image_model"] = "hunyuan3d2"
-        dit_config["in_channels"] = in_shape[1]
-        dit_config["context_in_dim"] = state_dict['{}cond_in.weight'.format(key_prefix)].shape[1]
-        dit_config["hidden_size"] = in_shape[0]
-        dit_config["mlp_ratio"] = 4.0
-        dit_config["num_heads"] = 16
-        dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
-        dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
-        dit_config["qkv_bias"] = True
-        dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
-        return dit_config
-
-    if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys:  # HiDream
-        dit_config = {}
-        dit_config["image_model"] = "hidream"
-        dit_config["attention_head_dim"] = 128
-        dit_config["axes_dims_rope"] = [64, 32, 32]
-        dit_config["caption_channels"] = [4096, 4096]
-        dit_config["max_resolution"] = [128, 128]
-        dit_config["in_channels"] = 16
-        dit_config["llama_layers"] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]
-        dit_config["num_attention_heads"] = 20
-        dit_config["num_routed_experts"] = 4
-        dit_config["num_activated_experts"] = 2
-        dit_config["num_layers"] = 16
-        dit_config["num_single_layers"] = 32
-        dit_config["out_channels"] = 16
-        dit_config["patch_size"] = 2
-        dit_config["text_emb_dim"] = 2048
-        return dit_config
-
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None

@@ -539,8 +418,8 @@ def model_config_from_unet_config(unet_config, state_dict=None):
    logging.error("no match {}".format(unet_config))
    return None

-def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False, metadata=None):
-    unet_config = detect_unet_config(state_dict, unet_key_prefix, metadata=metadata)
+def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False):
+    unet_config = detect_unet_config(state_dict, unet_key_prefix)
    if unet_config is None:
        return None
    model_config = model_config_from_unet_config(unet_config, state_dict)
@@ -553,10 +432,6 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
        model_config.scaled_fp8 = scaled_fp8_weight.dtype
        if model_config.scaled_fp8 == torch.float32:
            model_config.scaled_fp8 = torch.float8_e4m3fn
-        if scaled_fp8_weight.nelement() == 2:
-            model_config.optimizations["fp8"] = False
-        else:
-            model_config.optimizations["fp8"] = True

    return model_config

@@ -749,13 +624,8 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
            'use_temporal_attention': False, 'use_temporal_resblock': False}

-    LotusD = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': 4,
-            'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
-            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_heads': 8,
-            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
-            'use_temporal_attention': False, 'use_temporal_resblock': False}

-    supported_models = [LotusD, SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]
+    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B, SD09_XS, SD_XS, SDXL_diffusers_ip2p, SD15_diffusers_inpaint]

    for unet_config in supported_models:
        matches = True
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -19,7 +19,7 @@
 import psutil
 import logging
 from enum import Enum
-from comfy.cli_args import args, PerformanceFeature
+from comfy.cli_args import args
 import torch
 import sys
 import platform
@@ -46,39 +46,11 @@ cpu_state = CPUState.GPU

 total_vram = 0

-def get_supported_float8_types():
-    float8_types = []
-    try:
-        float8_types.append(torch.float8_e4m3fn)
-    except:
-        pass
-    try:
-        float8_types.append(torch.float8_e4m3fnuz)
-    except:
-        pass
-    try:
-        float8_types.append(torch.float8_e5m2)
-    except:
-        pass
-    try:
-        float8_types.append(torch.float8_e5m2fnuz)
-    except:
-        pass
-    try:
-        float8_types.append(torch.float8_e8m0fnu)
-    except:
-        pass
-    return float8_types
-
-FLOAT8_TYPES = get_supported_float8_types()
-
 xpu_available = False
 torch_version = ""
 try:
    torch_version = torch.version.__version__
-    temp = torch_version.split(".")
-    torch_version_numeric = (int(temp[0]), int(temp[1]))
-    xpu_available = (torch_version_numeric[0] < 2 or (torch_version_numeric[0] == 2 and torch_version_numeric[1] <= 4)) and torch.xpu.is_available()
+    xpu_available = (int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) <= 4)) and torch.xpu.is_available()
 except:
    pass

@@ -121,13 +93,6 @@ try:
 except:
    npu_available = False

-try:
-    import torch_mlu  # noqa: F401
-    _ = torch.mlu.device_count()
-    mlu_available = torch.mlu.is_available()
-except:
-    mlu_available = False
-
 if args.cpu:
    cpu_state = CPUState.CPU

@@ -145,12 +110,6 @@ def is_ascend_npu():
        return True
    return False

-def is_mlu():
-    global mlu_available
-    if mlu_available:
-        return True
-    return False
-
 def get_torch_device():
    global directml_enabled
    global cpu_state
@@ -166,8 +125,6 @@ def get_torch_device():
            return torch.device("xpu", torch.xpu.current_device())
        elif is_ascend_npu():
            return torch.device("npu", torch.npu.current_device())
-        elif is_mlu():
-            return torch.device("mlu", torch.mlu.current_device())
        else:
            return torch.device(torch.cuda.current_device())

@@ -194,12 +151,6 @@ def get_total_memory(dev=None, torch_total_too=False):
            _, mem_total_npu = torch.npu.mem_get_info(dev)
            mem_total_torch = mem_reserved
            mem_total = mem_total_npu
-        elif is_mlu():
-            stats = torch.mlu.memory_stats(dev)
-            mem_reserved = stats['reserved_bytes.all.current']
-            _, mem_total_mlu = torch.mlu.mem_get_info(dev)
-            mem_total_torch = mem_reserved
-            mem_total = mem_total_mlu
        else:
            stats = torch.cuda.memory_stats(dev)
            mem_reserved = stats['reserved_bytes.all.current']
@@ -212,21 +163,12 @@ def get_total_memory(dev=None, torch_total_too=False):
    else:
        return mem_total

-def mac_version():
-    try:
-        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
-    except:
-        return None
-
 total_vram = get_total_memory(get_torch_device()) / (1024 * 1024)
 total_ram = psutil.virtual_memory().total / (1024 * 1024)
 logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))

 try:
    logging.info("pytorch version: {}".format(torch_version))
-    mac_ver = mac_version()
-    if mac_ver is not None:
-        logging.info("Mac Version {}".format(mac_ver))
 except:
    pass

@@ -276,7 +218,7 @@ def is_amd():

 MIN_WEIGHT_MEMORY_RATIO = 0.4
 if is_nvidia():
-    MIN_WEIGHT_MEMORY_RATIO = 0.0
+    MIN_WEIGHT_MEMORY_RATIO = 0.2

 ENABLE_PYTORCH_ATTENTION = False
 if args.use_pytorch_cross_attention:
@@ -285,45 +227,22 @@ if args.use_pytorch_cross_attention:

 try:
    if is_nvidia():
-        if torch_version_numeric[0] >= 2:
+        if int(torch_version[0]) >= 2:
            if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
                ENABLE_PYTORCH_ATTENTION = True
-    if is_intel_xpu() or is_ascend_npu() or is_mlu():
+    if is_intel_xpu() or is_ascend_npu():
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True
 except:
    pass

-
-try:
-    if is_amd():
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
-        logging.info("AMD arch: {}".format(arch))
-        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
-            if torch_version_numeric[0] >= 2 and torch_version_numeric[1] >= 7:  # works on 2.6 but doesn't actually seem to improve much
-                if any((a in arch) for a in ["gfx1100", "gfx1101"]):  # TODO: more arches
-                    ENABLE_PYTORCH_ATTENTION = True
-except:
-    pass
-
-
 if ENABLE_PYTORCH_ATTENTION:
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_flash_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(True)

-
-PRIORITIZE_FP16 = False  # TODO: remove and replace with something that shows exactly which dtype is faster than the other
 try:
-    if is_nvidia() and PerformanceFeature.Fp16Accumulation in args.fast:
-        torch.backends.cuda.matmul.allow_fp16_accumulation = True
-        PRIORITIZE_FP16 = True  # TODO: limit to cards where it actually boosts performance
-        logging.info("Enabled fp16 accumulation.")
-except:
-    pass
-
-try:
-    if torch_version_numeric[0] == 2 and torch_version_numeric[1] >= 5:
+    if int(torch_version[0]) == 2 and int(torch_version[2]) >= 5:
        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
 except:
    logging.warning("Warning, could not set allow_fp16_bf16_reduction_math_sdp")
@@ -337,10 +256,15 @@ elif args.highvram or args.gpu_only:
    vram_state = VRAMState.HIGH_VRAM

 FORCE_FP32 = False
+FORCE_FP16 = False
 if args.force_fp32:
    logging.info("Forcing FP32, if this improves things please report it.")
    FORCE_FP32 = True

+if args.force_fp16:
+    logging.info("Forcing FP16.")
+    FORCE_FP16 = True
+
 if lowvram_available:
    if set_vram_to in (VRAMState.LOW_VRAM, VRAMState.NO_VRAM):
        vram_state = set_vram_to
@@ -373,8 +297,6 @@ def get_torch_device_name(device):
        return "{} {}".format(device, torch.xpu.get_device_name(device))
    elif is_ascend_npu():
        return "{} {}".format(device, torch.npu.get_device_name(device))
-    elif is_mlu():
-        return "{} {}".format(device, torch.mlu.get_device_name(device))
    else:
        return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))

@@ -613,11 +535,14 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
            vram_set_state = vram_state
        lowvram_model_memory = 0
        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM) and not force_full_load:
+            model_size = loaded_model.model_memory_required(torch_dev)
            loaded_memory = loaded_model.model_loaded_memory()
            current_free_mem = get_free_memory(torch_dev) + loaded_memory

-            lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
+            lowvram_model_memory = max(64 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
            lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
+            if model_size <= lowvram_model_memory: #only switch to lowvram if really necessary
+                lowvram_model_memory = 0

        if vram_set_state == VRAMState.NO_VRAM:
            lowvram_model_memory = 0.1
@@ -710,7 +635,7 @@ def unet_inital_load_device(parameters, dtype):
 def maximum_vram_for_weights(device=None):
    return (get_total_memory(device) * 0.88 - minimum_inference_memory())

-def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32], weight_dtype=None):
+def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
    if model_params < 0:
        model_params = 1000000000000000000000
    if args.fp32_unet:
@@ -725,12 +650,15 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
        return torch.float8_e4m3fn
    if args.fp8_e5m2_unet:
        return torch.float8_e5m2
-    if args.fp8_e8m0fnu_unet:
-        return torch.float8_e8m0fnu

    fp8_dtype = None
-    if weight_dtype in FLOAT8_TYPES:
-        fp8_dtype = weight_dtype
+    try:
+        for dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+            if dtype in supported_dtypes:
+                fp8_dtype = dtype
+                break
+    except:
+        pass

    if fp8_dtype is not None:
        if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
@@ -740,10 +668,6 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
        if model_params * 2 > free_model_memory:
            return fp8_dtype

-    if PRIORITIZE_FP16 or weight_dtype == torch.float16:
-        if torch.float16 in supported_dtypes and should_use_fp16(device=device, model_params=model_params):
-            return torch.float16
-
    for dt in supported_dtypes:
        if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params):
            if torch.float16 in supported_dtypes:
@@ -776,9 +700,6 @@ def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.flo
        return None

    fp16_supported = should_use_fp16(inference_device, prioritize_performance=True)
-    if PRIORITIZE_FP16 and fp16_supported and torch.float16 in supported_dtypes:
-        return torch.float16
-
    for dt in supported_dtypes:
        if dt == torch.float16 and fp16_supported:
            return torch.float16
@@ -825,8 +746,6 @@ def text_encoder_dtype(device=None):
        return torch.float8_e5m2
    elif args.fp16_text_enc:
        return torch.float16
-    elif args.bf16_text_enc:
-        return torch.bfloat16
    elif args.fp32_text_enc:
        return torch.float32

@@ -939,61 +858,15 @@ def force_channels_last():
    #TODO
    return False

-
-STREAMS = {}
-NUM_STREAMS = 1
-if args.async_offload:
-    NUM_STREAMS = 2
-    logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS))
-
-stream_counters = {}
-def get_offload_stream(device):
-    stream_counter = stream_counters.get(device, 0)
-    if NUM_STREAMS <= 1:
-        return None
-
-    if device in STREAMS:
-        ss = STREAMS[device]
-        s = ss[stream_counter]
-        stream_counter = (stream_counter + 1) % len(ss)
-        if is_device_cuda(device):
-            ss[stream_counter].wait_stream(torch.cuda.current_stream())
-        stream_counters[device] = stream_counter
-        return s
-    elif is_device_cuda(device):
-        ss = []
-        for k in range(NUM_STREAMS):
-            ss.append(torch.cuda.Stream(device=device, priority=0))
-        STREAMS[device] = ss
-        s = ss[stream_counter]
-        stream_counter = (stream_counter + 1) % len(ss)
-        stream_counters[device] = stream_counter
-        return s
-    return None
-
-def sync_stream(device, stream):
-    if stream is None:
-        return
-    if is_device_cuda(device):
-        torch.cuda.current_stream().wait_stream(stream)
-
-def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
+def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False):
    if device is None or weight.device == device:
        if not copy:
            if dtype is None or weight.dtype == dtype:
                return weight
-        if stream is not None:
-            with stream:
-                return weight.to(dtype=dtype, copy=copy)
        return weight.to(dtype=dtype, copy=copy)

-    if stream is not None:
-        with stream:
-            r = torch.empty_like(weight, dtype=dtype, device=device)
-            r.copy_(weight, non_blocking=non_blocking)
-    else:
-        r = torch.empty_like(weight, dtype=dtype, device=device)
-        r.copy_(weight, non_blocking=non_blocking)
+    r = torch.empty_like(weight, dtype=dtype, device=device)
+    r.copy_(weight, non_blocking=non_blocking)
    return r

 def cast_to_device(tensor, device, dtype, copy=False):
@@ -1003,9 +876,6 @@ def cast_to_device(tensor, device, dtype, copy=False):
 def sage_attention_enabled():
    return args.use_sage_attention

-def flash_attention_enabled():
-    return args.use_flash_attention
-
 def xformers_enabled():
    global directml_enabled
    global cpu_state
@@ -1015,8 +885,6 @@ def xformers_enabled():
        return False
    if is_ascend_npu():
        return False
-    if is_mlu():
-        return False
    if directml_enabled:
        return False
    return XFORMERS_IS_AVAILABLE
@@ -1033,11 +901,6 @@ def pytorch_attention_enabled():
    global ENABLE_PYTORCH_ATTENTION
    return ENABLE_PYTORCH_ATTENTION

-def pytorch_attention_enabled_vae():
-    if is_amd():
-        return False  # enabling pytorch attention on AMD currently causes crash when doing high res
-    return pytorch_attention_enabled()
-
 def pytorch_attention_flash_attention():
    global ENABLE_PYTORCH_ATTENTION
    if ENABLE_PYTORCH_ATTENTION:
@@ -1048,21 +911,23 @@ def pytorch_attention_flash_attention():
            return True
        if is_ascend_npu():
            return True
-        if is_mlu():
-            return True
-        if is_amd():
-            return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
    return False

+def mac_version():
+    try:
+        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
+    except:
+        return None
+
 def force_upcast_attention_dtype():
    upcast = args.force_upcast_attention

    macos_version = mac_version()
-    if macos_version is not None and ((14, 5) <= macos_version < (16,)):  # black image bug on recent versions of macOS
+    if macos_version is not None and ((14, 5) <= macos_version <= (15, 2)):  # black image bug on recent versions of macOS
        upcast = True

    if upcast:
-        return {torch.float16: torch.float32}
+        return torch.float32
    else:
        return None

@@ -1092,13 +957,6 @@ def get_free_memory(dev=None, torch_free_too=False):
            mem_free_npu, _ = torch.npu.mem_get_info(dev)
            mem_free_torch = mem_reserved - mem_active
            mem_free_total = mem_free_npu + mem_free_torch
-        elif is_mlu():
-            stats = torch.mlu.memory_stats(dev)
-            mem_active = stats['active_bytes.all.current']
-            mem_reserved = stats['reserved_bytes.all.current']
-            mem_free_mlu, _ = torch.mlu.mem_get_info(dev)
-            mem_free_torch = mem_reserved - mem_active
-            mem_free_total = mem_free_mlu + mem_free_torch
        else:
            stats = torch.cuda.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
@@ -1135,26 +993,21 @@ def is_device_mps(device):
 def is_device_cuda(device):
    return is_device_type(device, 'cuda')

-def is_directml_enabled():
-    global directml_enabled
-    if directml_enabled:
-        return True
-
-    return False
-
 def should_use_fp16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
+    global directml_enabled
+
    if device is not None:
        if is_device_cpu(device):
            return False

-    if args.force_fp16:
+    if FORCE_FP16:
        return True

    if FORCE_FP32:
        return False

-    if is_directml_enabled():
-        return True
+    if directml_enabled:
+        return False

    if (device is not None and is_device_mps(device)) or mps_mode():
        return True
@@ -1168,9 +1021,6 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    if is_ascend_npu():
        return True

-    if is_mlu():
-        return True
-
    if torch.version.hip:
        return True

@@ -1228,28 +1078,13 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
    if is_intel_xpu():
        return True

-    if is_ascend_npu():
-        return True
-
-    if is_amd():
-        arch = torch.cuda.get_device_properties(device).gcnArchName
-        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]):  # RDNA2 and older don't support bf16
-            if manual_cast:
-                return True
-            return False
-
    props = torch.cuda.get_device_properties(device)
-
-    if is_mlu():
-        if props.major > 3:
-            return True
-
    if props.major >= 8:
        return True

    bf16_works = torch.cuda.is_bf16_supported()

-    if bf16_works and manual_cast:
+    if bf16_works or manual_cast:
        free_model_memory = maximum_vram_for_weights(device)
        if (not prioritize_performance) or model_params * 4 > free_model_memory:
            return True
@@ -1268,11 +1103,11 @@ def supports_fp8_compute(device=None):
    if props.minor < 9:
        return False

-    if torch_version_numeric[0] < 2 or (torch_version_numeric[0] == 2 and torch_version_numeric[1] < 3):
+    if int(torch_version[0]) < 2 or (int(torch_version[0]) == 2 and int(torch_version[2]) < 3):
        return False

    if WINDOWS:
-        if (torch_version_numeric[0] == 2 and torch_version_numeric[1] < 4):
+        if (int(torch_version[0]) == 2 and int(torch_version[2]) < 4):
            return False

    return True
@@ -1285,8 +1120,6 @@ def soft_empty_cache(force=False):
        torch.xpu.empty_cache()
    elif is_ascend_npu():
        torch.npu.empty_cache()
-    elif is_mlu():
-        torch.mlu.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -96,28 +96,8 @@ def wipe_lowvram_weight(m):
    if hasattr(m, "prev_comfy_cast_weights"):
        m.comfy_cast_weights = m.prev_comfy_cast_weights
        del m.prev_comfy_cast_weights
-
-    if hasattr(m, "weight_function"):
-        m.weight_function = []
-
-    if hasattr(m, "bias_function"):
-        m.bias_function = []
-
-def move_weight_functions(m, device):
-    if device is None:
-        return 0
-
-    memory = 0
-    if hasattr(m, "weight_function"):
-        for f in m.weight_function:
-            if hasattr(f, "move_to"):
-                memory += f.move_to(device=device)
-
-    if hasattr(m, "bias_function"):
-        for f in m.bias_function:
-            if hasattr(f, "move_to"):
-                memory += f.move_to(device=device)
-    return memory
+    m.weight_function = None
+    m.bias_function = None

 class LowVramPatch:
    def __init__(self, key, patches):
@@ -212,13 +192,11 @@ class ModelPatcher:
        self.backup = {}
        self.object_patches = {}
        self.object_patches_backup = {}
-        self.weight_wrapper_patches = {}
        self.model_options = {"transformer_options":{}}
        self.model_size()
        self.load_device = load_device
        self.offload_device = offload_device
        self.weight_inplace_update = weight_inplace_update
-        self.force_cast_weights = False
        self.patches_uuid = uuid.uuid4()
        self.parent = None

@@ -272,14 +250,11 @@ class ModelPatcher:
        n.patches_uuid = self.patches_uuid

        n.object_patches = self.object_patches.copy()
-        n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
        n.model_options = copy.deepcopy(self.model_options)
        n.backup = self.backup
        n.object_patches_backup = self.object_patches_backup
        n.parent = self

-        n.force_cast_weights = self.force_cast_weights
-
        # attachments
        n.attachments = {}
        for k in self.attachments:
@@ -427,16 +402,6 @@ class ModelPatcher:
    def add_object_patch(self, name, obj):
        self.object_patches[name] = obj

-    def set_model_compute_dtype(self, dtype):
-        self.add_object_patch("manual_cast_dtype", dtype)
-        if dtype is not None:
-            self.force_cast_weights = True
-        self.patches_uuid = uuid.uuid4() #TODO: optimize by preventing a full model reload for this
-
-    def add_weight_wrapper(self, name, function):
-        self.weight_wrapper_patches[name] = self.weight_wrapper_patches.get(name, []) + [function]
-        self.patches_uuid = uuid.uuid4()
-
    def get_model_object(self, name: str) -> torch.nn.Module:
        """Retrieves a nested attribute from an object using dot notation considering
        object patches.
@@ -601,9 +566,6 @@ class ModelPatcher:

                lowvram_weight = False

-                weight_key = "{}.weight".format(n)
-                bias_key = "{}.bias".format(n)
-
                if not full_load and hasattr(m, "comfy_cast_weights"):
                    if mem_counter + module_mem >= lowvram_model_memory:
                        lowvram_weight = True
@@ -611,46 +573,34 @@ class ModelPatcher:
                        if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
                            continue

-                cast_weight = self.force_cast_weights
-                if lowvram_weight:
-                    if hasattr(m, "comfy_cast_weights"):
-                        m.weight_function = []
-                        m.bias_function = []
+                weight_key = "{}.weight".format(n)
+                bias_key = "{}.bias".format(n)

+                if lowvram_weight:
                    if weight_key in self.patches:
                        if force_patch_weights:
                            self.patch_weight_to_device(weight_key)
                        else:
-                            m.weight_function = [LowVramPatch(weight_key, self.patches)]
+                            m.weight_function = LowVramPatch(weight_key, self.patches)
                            patch_counter += 1
                    if bias_key in self.patches:
                        if force_patch_weights:
                            self.patch_weight_to_device(bias_key)
                        else:
-                            m.bias_function = [LowVramPatch(bias_key, self.patches)]
+                            m.bias_function = LowVramPatch(bias_key, self.patches)
                            patch_counter += 1

-                    cast_weight = True
+                    m.prev_comfy_cast_weights = m.comfy_cast_weights
+                    m.comfy_cast_weights = True
                else:
                    if hasattr(m, "comfy_cast_weights"):
-                        wipe_lowvram_weight(m)
+                        if m.comfy_cast_weights:
+                            wipe_lowvram_weight(m)

                    if full_load or mem_counter + module_mem < lowvram_model_memory:
                        mem_counter += module_mem
                        load_completely.append((module_mem, n, m, params))

-                if cast_weight and hasattr(m, "comfy_cast_weights"):
-                    m.prev_comfy_cast_weights = m.comfy_cast_weights
-                    m.comfy_cast_weights = True
-
-                if weight_key in self.weight_wrapper_patches:
-                    m.weight_function.extend(self.weight_wrapper_patches[weight_key])
-
-                if bias_key in self.weight_wrapper_patches:
-                    m.bias_function.extend(self.weight_wrapper_patches[bias_key])
-
-                mem_counter += move_weight_functions(m, device_to)
-
            load_completely.sort(reverse=True)
            for x in load_completely:
                n = x[1]
@@ -712,7 +662,6 @@ class ModelPatcher:
            self.unpatch_hooks()
            if self.model.model_lowvram:
                for m in self.model.modules():
-                    move_weight_functions(m, device_to)
                    wipe_lowvram_weight(m)

                self.model.model_lowvram = False
@@ -747,7 +696,6 @@ class ModelPatcher:

    def partially_unload(self, device_to, memory_to_free=0):
        with self.use_ejected():
-            hooks_unpatched = False
            memory_freed = 0
            patch_counter = 0
            unload_list = self._load_list()
@@ -771,10 +719,6 @@ class ModelPatcher:
                                move_weight = False
                                break

-                            if not hooks_unpatched:
-                                self.unpatch_hooks()
-                                hooks_unpatched = True
-
                            if bk.inplace_update:
                                comfy.utils.copy_to_param(self.model, key, bk.weight)
                            else:
@@ -784,19 +728,15 @@ class ModelPatcher:
                    weight_key = "{}.weight".format(n)
                    bias_key = "{}.bias".format(n)
                    if move_weight:
-                        cast_weight = self.force_cast_weights
                        m.to(device_to)
-                        module_mem += move_weight_functions(m, device_to)
                        if lowvram_possible:
                            if weight_key in self.patches:
-                                m.weight_function.append(LowVramPatch(weight_key, self.patches))
+                                m.weight_function = LowVramPatch(weight_key, self.patches)
                                patch_counter += 1
                            if bias_key in self.patches:
-                                m.bias_function.append(LowVramPatch(bias_key, self.patches))
+                                m.bias_function = LowVramPatch(bias_key, self.patches)
                                patch_counter += 1
-                            cast_weight = True

-                        if cast_weight:
                            m.prev_comfy_cast_weights = m.comfy_cast_weights
                            m.comfy_cast_weights = True
                        m.comfy_patched_weights = False
@@ -1094,6 +1034,7 @@ class ModelPatcher:

    def patch_hooks(self, hooks: comfy.hooks.HookGroup):
        with self.use_ejected():
+            self.unpatch_hooks()
            if hooks is not None:
                model_sd_keys = list(self.model_state_dict().keys())
                memory_counter = None
@@ -1104,16 +1045,12 @@ class ModelPatcher:
                # if have cached weights for hooks, use it
                cached_weights = self.cached_hook_patches.get(hooks, None)
                if cached_weights is not None:
-                    model_sd_keys_set = set(model_sd_keys)
                    for key in cached_weights:
                        if key not in model_sd_keys:
                            logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}")
                            continue
                        self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter)
-                        model_sd_keys_set.remove(key)
-                    self.unpatch_hooks(model_sd_keys_set)
                else:
-                    self.unpatch_hooks()
                    relevant_patches = self.get_combined_hook_patches(hooks=hooks)
                    original_weights = None
                    if len(relevant_patches) > 0:
@@ -1124,8 +1061,6 @@ class ModelPatcher:
                            continue
                        self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights,
                                                            memory_counter=memory_counter)
-            else:
-                self.unpatch_hooks()
            self.current_hooks = hooks

    def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
@@ -1182,23 +1117,17 @@ class ModelPatcher:
        del out_weight
        del weight

-    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
+    def unpatch_hooks(self) -> None:
        with self.use_ejected():
            if len(self.hook_backup) == 0:
                self.current_hooks = None
                return
            keys = list(self.hook_backup.keys())
-            if whitelist_keys_set:
-                for k in keys:
-                    if k in whitelist_keys_set:
-                        comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
-                        self.hook_backup.pop(k)
-            else:
-                for k in keys:
-                    comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
+            for k in keys:
+                comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))

-                self.hook_backup.clear()
-                self.current_hooks = None
+            self.hook_backup.clear()
+            self.current_hooks = None

    def clean_hooks(self):
        self.unpatch_hooks()
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@@ -31,7 +31,6 @@ class EPS:
        return model_input - model_output * sigma

    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
-        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
        if max_denoise:
            noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
        else:
@@ -62,22 +61,11 @@ class CONST:
        return model_input - model_output * sigma

    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
-        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
        return sigma * noise + (1.0 - sigma) * latent_image

    def inverse_noise_scaling(self, sigma, latent):
-        sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1))
        return latent / (1.0 - sigma)

-class X0(EPS):
-    def calculate_denoised(self, sigma, model_output, model_input):
-        return model_output
-
-class IMG_TO_IMG(X0):
-    def calculate_input(self, sigma, noise):
-        return noise
-
-
 class ModelSamplingDiscrete(torch.nn.Module):
    def __init__(self, model_config=None, zsnr=None):
        super().__init__()
@@ -111,14 +99,13 @@ class ModelSamplingDiscrete(torch.nn.Module):
        self.num_timesteps = int(timesteps)
        self.linear_start = linear_start
        self.linear_end = linear_end
-        self.zsnr = zsnr

        # self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32))
        # self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32))
        # self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))

        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
-        if self.zsnr:
+        if zsnr:
            sigmas = rescale_zero_terminal_snr_sigmas(sigmas)

        self.set_sigmas(sigmas)
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -17,12 +17,9 @@
 """

 import torch
-import logging
 import comfy.model_management
-from comfy.cli_args import args, PerformanceFeature
+from comfy.cli_args import args
 import comfy.float
-import comfy.rmsnorm
-import contextlib

 cast_to = comfy.model_management.cast_to #TODO: remove once no more references

@@ -38,37 +35,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
        if device is None:
            device = input.device

-    offload_stream = comfy.model_management.get_offload_stream(device)
-    if offload_stream is not None:
-        wf_context = offload_stream
-    else:
-        wf_context = contextlib.nullcontext()
-
    bias = None
    non_blocking = comfy.model_management.device_supports_non_blocking(device)
    if s.bias is not None:
-        has_function = len(s.bias_function) > 0
-        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
-
+        has_function = s.bias_function is not None
+        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function)
        if has_function:
-            with wf_context:
-                for f in s.bias_function:
-                    bias = f(bias)
+            bias = s.bias_function(bias)

-    has_function = len(s.weight_function) > 0
-    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
+    has_function = s.weight_function is not None
+    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function)
    if has_function:
-        with wf_context:
-            for f in s.weight_function:
-                weight = f(weight)
-
-    comfy.model_management.sync_stream(device, offload_stream)
+        weight = s.weight_function(weight)
    return weight, bias

 class CastWeightBiasOp:
    comfy_cast_weights = False
-    weight_function = []
-    bias_function = []
+    weight_function = None
+    bias_function = None

 class disable_weight_init:
    class Linear(torch.nn.Linear, CastWeightBiasOp):
@@ -80,7 +64,7 @@ class disable_weight_init:
            return torch.nn.functional.linear(input, weight, bias)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -94,7 +78,7 @@ class disable_weight_init:
            return self._conv_forward(input, weight, bias)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -108,7 +92,7 @@ class disable_weight_init:
            return self._conv_forward(input, weight, bias)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -122,7 +106,7 @@ class disable_weight_init:
            return self._conv_forward(input, weight, bias)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -136,11 +120,12 @@ class disable_weight_init:
            return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)

+
    class LayerNorm(torch.nn.LayerNorm, CastWeightBiasOp):
        def reset_parameters(self):
            return None
@@ -154,26 +139,7 @@ class disable_weight_init:
            return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
-                return self.forward_comfy_cast_weights(*args, **kwargs)
-            else:
-                return super().forward(*args, **kwargs)
-
-    class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
-        def reset_parameters(self):
-            self.bias = None
-            return None
-
-        def forward_comfy_cast_weights(self, input):
-            if self.weight is not None:
-                weight, bias = cast_bias_weight(self, input)
-            else:
-                weight = None
-            return comfy.rmsnorm.rms_norm(input, weight, self.eps)  # TODO: switch to commented out line when old torch is deprecated
-            # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
-
-        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -194,7 +160,7 @@ class disable_weight_init:
                output_padding, self.groups, self.dilation)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -215,7 +181,7 @@ class disable_weight_init:
                output_padding, self.groups, self.dilation)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
@@ -233,7 +199,7 @@ class disable_weight_init:
            return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)

        def forward(self, *args, **kwargs):
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                if "out_dtype" in kwargs:
@@ -275,9 +241,6 @@ class manual_cast(disable_weight_init):
    class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
        comfy_cast_weights = True

-    class RMSNorm(disable_weight_init.RMSNorm):
-        comfy_cast_weights = True
-
    class Embedding(disable_weight_init.Embedding):
        comfy_cast_weights = True

@@ -308,10 +271,10 @@ def fp8_linear(self, input):
        if scale_input is None:
            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
            input = torch.clamp(input, min=-448, max=448, out=input)
-            input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
+            input = input.reshape(-1, input_shape[2]).to(dtype)
        else:
            scale_input = scale_input.to(input.device)
-            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous()
+            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype)

        if bias is not None:
            o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
@@ -344,7 +307,6 @@ class fp8_ops(manual_cast):
            return torch.nn.functional.linear(input, weight, bias)

 def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
-    logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
    class scaled_fp8_op(manual_cast):
        class Linear(manual_cast.Linear):
            def __init__(self, *args, **kwargs):
@@ -392,46 +354,14 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None

    return scaled_fp8_op

-CUBLAS_IS_AVAILABLE = False
-try:
-    from cublas_ops import CublasLinear
-    CUBLAS_IS_AVAILABLE = True
-except ImportError:
-    pass
-
-if CUBLAS_IS_AVAILABLE:
-    class cublas_ops(disable_weight_init):
-        class Linear(CublasLinear, disable_weight_init.Linear):
-            def reset_parameters(self):
-                return None
-
-            def forward_comfy_cast_weights(self, input):
-                return super().forward(input)
-
-            def forward(self, *args, **kwargs):
-                return super().forward(*args, **kwargs)
-
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
    if scaled_fp8 is not None:
-        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
+        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)

-    if (
-        fp8_compute and
-        (fp8_optimizations or PerformanceFeature.Fp8MatrixMultiplication in args.fast) and
-        not disable_fast_fp8
-    ):
+    if fp8_compute and (fp8_optimizations or args.fast) and not disable_fast_fp8:
        return fp8_ops

-    if (
-        PerformanceFeature.CublasOps in args.fast and
-        CUBLAS_IS_AVAILABLE and
-        weight_dtype == torch.float16 and
-        (compute_dtype == torch.float16 or compute_dtype is None)
-    ):
-        logging.info("Using cublas ops")
-        return cublas_ops
-
    if compute_dtype is None or weight_dtype == compute_dtype:
        return disable_weight_init

--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
@@ -48,7 +48,6 @@ def get_all_callbacks(call_type: str, transformer_options: dict, is_model_option

 class WrappersMP:
    OUTER_SAMPLE = "outer_sample"
-    PREPARE_SAMPLING = "prepare_sampling"
    SAMPLER_SAMPLE = "sampler_sample"
    CALC_COND_BATCH = "calc_cond_batch"
    APPLY_MODEL = "apply_model"
--- a/comfy/rmsnorm.py
+++ b/comfy/rmsnorm.py
@@ -1,55 +0,0 @@
-import torch
-import comfy.model_management
-import numbers
-
-RMSNorm = None
-
-try:
-    rms_norm_torch = torch.nn.functional.rms_norm
-    RMSNorm = torch.nn.RMSNorm
-except:
-    rms_norm_torch = None
-
-
-def rms_norm(x, weight=None, eps=1e-6):
-    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
-        if weight is None:
-            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
-        else:
-            return rms_norm_torch(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
-    else:
-        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
-        if weight is None:
-            return r
-        else:
-            return r * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
-
-
-if RMSNorm is None:
-    class RMSNorm(torch.nn.Module):
-        def __init__(
-            self,
-            normalized_shape,
-            eps=None,
-            elementwise_affine=True,
-            device=None,
-            dtype=None,
-        ):
-            factory_kwargs = {"device": device, "dtype": dtype}
-            super().__init__()
-            if isinstance(normalized_shape, numbers.Integral):
-                # mypy error: incompatible types in assignment
-                normalized_shape = (normalized_shape,)  # type: ignore[assignment]
-            self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
-            self.eps = eps
-            self.elementwise_affine = elementwise_affine
-            if self.elementwise_affine:
-                self.weight = torch.nn.Parameter(
-                    torch.empty(self.normalized_shape, **factory_kwargs)
-                )
-            else:
-                self.register_parameter("weight", None)
-            self.bias = None
-
-        def forward(self, x):
-            return rms_norm(x, self.weight, self.eps)
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -58,6 +58,7 @@ def convert_cond(cond):
        temp = c[1].copy()
        model_conds = temp.get("model_conds", {})
        if c[0] is not None:
+            model_conds["c_crossattn"] = comfy.conds.CONDCrossAttn(c[0]) #TODO: remove
            temp["cross_attn"] = c[0]
        temp["model_conds"] = model_conds
        temp["uuid"] = uuid.uuid4()
@@ -106,13 +107,6 @@ def cleanup_additional_models(models):


 def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
-    executor = comfy.patcher_extension.WrapperExecutor.new_executor(
-        _prepare_sampling,
-        comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
-    )
-    return executor.execute(model, noise_shape, conds, model_options=model_options)
-
-def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
    real_model: BaseModel = None
    models, inference_memory = get_additional_models(conds, model.model_dtype())
    models += get_additional_models_from_model_options(model_options)
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -12,6 +12,7 @@ import collections
 from comfy import model_management
 import math
 import logging
+import comfy.samplers
 import comfy.sampler_helpers
 import comfy.model_patcher
 import comfy.patcher_extension
@@ -19,12 +20,6 @@ import comfy.hooks
 import scipy.stats
 import numpy

-
-def add_area_dims(area, num_dims):
-    while (len(area) // 2) < num_dims:
-        area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
-    return area
-
 def get_area_and_mult(conds, x_in, timestep_in):
    dims = tuple(x_in.shape[2:])
    area = None
@@ -40,10 +35,6 @@ def get_area_and_mult(conds, x_in, timestep_in):
            return None
    if 'area' in conds:
        area = list(conds['area'])
-        area = add_area_dims(area, len(dims))
-        if (len(area) // 2) > len(dims):
-            area = area[:len(dims)] + area[len(area) // 2:(len(area) // 2) + len(dims)]
-
    if 'strength' in conds:
        strength = conds['strength']

@@ -60,7 +51,7 @@ def get_area_and_mult(conds, x_in, timestep_in):
        if "mask_strength" in conds:
            mask_strength = conds["mask_strength"]
        mask = conds['mask']
-        assert (mask.shape[1:] == x_in.shape[2:])
+        assert(mask.shape[1:] == x_in.shape[2:])

        mask = mask[:input_x.shape[0]]
        if area is not None:
@@ -74,17 +65,16 @@ def get_area_and_mult(conds, x_in, timestep_in):
    mult = mask * strength

    if 'mask' not in conds and area is not None:
-        fuzz = 8
+        rr = 8
        for i in range(len(dims)):
-            rr = min(fuzz, mult.shape[2 + i] // 4)
            if area[len(dims) + i] != 0:
                for t in range(rr):
                    m = mult.narrow(i + 2, t, 1)
-                    m *= ((1.0 / rr) * (t + 1))
+                    m *= ((1.0/rr) * (t + 1))
            if (area[i] + area[len(dims) + i]) < x_in.shape[i + 2]:
                for t in range(rr):
                    m = mult.narrow(i + 2, area[i] - 1 - t, 1)
-                    m *= ((1.0 / rr) * (t + 1))
+                    m *= ((1.0/rr) * (t + 1))

    conditioning = {}
    model_conds = conds["model_conds"]
@@ -188,7 +178,7 @@ def finalize_default_conds(model: 'BaseModel', hooked_to_run: dict[comfy.hooks.H
        cond = default_conds[i]
        for x in cond:
            # do get_area_and_mult to get all the expected values
-            p = get_area_and_mult(x, x_in, timestep)
+            p = comfy.samplers.get_area_and_mult(x, x_in, timestep)
            if p is None:
                continue
            # replace p's mult with calculated mult
@@ -225,7 +215,7 @@ def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Te
                    default_c.append(x)
                    has_default_conds = True
                    continue
-                p = get_area_and_mult(x, x_in, timestep)
+                p = comfy.samplers.get_area_and_mult(x, x_in, timestep)
                if p is None:
                    continue
                if p.hooks is not None:
@@ -559,37 +549,25 @@ def resolve_areas_and_cond_masks(conditions, h, w, device):
    logging.warning("WARNING: The comfy.samplers.resolve_areas_and_cond_masks function is deprecated please use the resolve_areas_and_cond_masks_multidim one instead.")
    return resolve_areas_and_cond_masks_multidim(conditions, [h, w], device)

-def create_cond_with_same_area_if_none(conds, c):
+def create_cond_with_same_area_if_none(conds, c): #TODO: handle dim != 2
    if 'area' not in c:
        return

-    def area_inside(a, area_cmp):
-        a = add_area_dims(a, len(area_cmp) // 2)
-        area_cmp = add_area_dims(area_cmp, len(a) // 2)
-
-        a_l = len(a) // 2
-        area_cmp_l = len(area_cmp) // 2
-        for i in range(min(a_l, area_cmp_l)):
-            if a[a_l + i] < area_cmp[area_cmp_l + i]:
-                return False
-        for i in range(min(a_l, area_cmp_l)):
-            if (a[i] + a[a_l + i]) > (area_cmp[i] + area_cmp[area_cmp_l + i]):
-                return False
-        return True
-
    c_area = c['area']
    smallest = None
    for x in conds:
        if 'area' in x:
            a = x['area']
-            if area_inside(c_area, a):
-                if smallest is None:
-                    smallest = x
-                elif 'area' not in smallest:
-                    smallest = x
-                else:
-                    if math.prod(smallest['area'][:len(smallest['area']) // 2]) > math.prod(a[:len(a) // 2]):
-                        smallest = x
+            if c_area[2] >= a[2] and c_area[3] >= a[3]:
+                if a[0] + a[2] >= c_area[0] + c_area[2]:
+                    if a[1] + a[3] >= c_area[1] + c_area[3]:
+                        if smallest is None:
+                            smallest = x
+                        elif 'area' not in smallest:
+                            smallest = x
+                        else:
+                            if smallest['area'][0] * smallest['area'][1] > a[0] * a[1]:
+                                smallest = x
        else:
            if smallest is None:
                smallest = x
@@ -709,8 +687,7 @@ class Sampler:
 KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral",
                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
-                  "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
-                  "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"]
+                  "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp"]

 class KSAMPLER(Sampler):
    def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-import json
 import torch
 from enum import Enum
 import logging
@@ -13,9 +12,6 @@ from .ldm.audio.autoencoder import AudioOobleckVAE
 import comfy.ldm.genmo.vae.model
 import comfy.ldm.lightricks.vae.causal_video_autoencoder
 import comfy.ldm.cosmos.vae
-import comfy.ldm.wan.vae
-import comfy.ldm.hunyuan3d.vae
-import comfy.ldm.ace.vae.music_dcae_pipeline
 import yaml
 import math

@@ -40,10 +36,6 @@ import comfy.text_encoders.genmo
 import comfy.text_encoders.lt
 import comfy.text_encoders.hunyuan_video
 import comfy.text_encoders.cosmos
-import comfy.text_encoders.lumina2
-import comfy.text_encoders.wan
-import comfy.text_encoders.hidream
-import comfy.text_encoders.ace

 import comfy.model_patcher
 import comfy.lora
@@ -122,7 +114,6 @@ class CLIP:
        self.layer_idx = None
        self.use_clip_schedule = False
        logging.info("CLIP/text encoder model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype))
-        self.tokenizer_options = {}

    def clone(self):
        n = CLIP(no_init=True)
@@ -130,7 +121,6 @@ class CLIP:
        n.cond_stage_model = self.cond_stage_model
        n.tokenizer = self.tokenizer
        n.layer_idx = self.layer_idx
-        n.tokenizer_options = self.tokenizer_options.copy()
        n.use_clip_schedule = self.use_clip_schedule
        n.apply_hooks_to_conds = self.apply_hooks_to_conds
        return n
@@ -138,19 +128,11 @@ class CLIP:
    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
        return self.patcher.add_patches(patches, strength_patch, strength_model)

-    def set_tokenizer_option(self, option_name, value):
-        self.tokenizer_options[option_name] = value
-
    def clip_layer(self, layer_idx):
        self.layer_idx = layer_idx

-    def tokenize(self, text, return_word_ids=False, **kwargs):
-        tokenizer_options = kwargs.get("tokenizer_options", {})
-        if len(self.tokenizer_options) > 0:
-            tokenizer_options = {**self.tokenizer_options, **tokenizer_options}
-        if len(tokenizer_options) > 0:
-            kwargs["tokenizer_options"] = tokenizer_options
-        return self.tokenizer.tokenize_with_weights(text, return_word_ids, **kwargs)
+    def tokenize(self, text, return_word_ids=False):
+        return self.tokenizer.tokenize_with_weights(text, return_word_ids)

    def add_hooks_to_dict(self, pooled_dict: dict[str]):
        if self.apply_hooks_to_conds:
@@ -264,7 +246,7 @@ class CLIP:
        return self.patcher.get_key_patches()

 class VAE:
-    def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
+    def __init__(self, sd=None, device=None, config=None, dtype=None):
        if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
            sd = diffusers_convert.convert_vae_state_dict(sd)

@@ -278,11 +260,9 @@ class VAE:
        self.process_input = lambda image: image * 2.0 - 1.0
        self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
        self.working_dtypes = [torch.bfloat16, torch.float32]
-        self.disable_offload = False

        self.downscale_index_formula = None
        self.upscale_index_formula = None
-        self.extra_1d_channel = None

        if config is None:
            if "decoder.mid.block_1.mix_factor" in sd:
@@ -352,7 +332,6 @@ class VAE:
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-                self.disable_offload = True
            elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: #genmo mochi vae
                if "blocks.2.blocks.3.stack.5.weight" in sd:
                    sd = comfy.utils.state_dict_prefix_replace(sd, {"": "decoder."})
@@ -375,12 +354,7 @@ class VAE:
                    version = 0
                elif tensor_conv1.shape[0] == 1024:
                    version = 1
-                    if "encoder.down_blocks.1.conv.conv.bias" in sd:
-                        version = 2
-                vae_config = None
-                if metadata is not None and "config" in metadata:
-                    vae_config = json.loads(metadata["config"]).get("vae", None)
-                self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version, config=vae_config)
+                self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version)
                self.latent_channels = 128
                self.latent_dim = 3
                self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
@@ -414,46 +388,9 @@ class VAE:
                ddconfig = {'z_channels': 16, 'latent_channels': self.latent_channels, 'z_factor': 1, 'resolution': 1024, 'in_channels': 3, 'out_channels': 3, 'channels': 128, 'channels_mult': [2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [32], 'dropout': 0.0, 'patch_size': 4, 'num_groups': 1, 'temporal_compression': 8, 'spacial_compression': 8}
                self.first_stage_model = comfy.ldm.cosmos.vae.CausalContinuousVideoTokenizer(**ddconfig)
                #TODO: these values are a bit off because this is not a standard VAE
-                self.memory_used_decode = lambda shape, dtype: (50 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
-                self.memory_used_encode = lambda shape, dtype: (50 * (round((shape[2] + 7) / 8) * 8) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (220 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (500 * max(shape[2], 2) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
                self.working_dtypes = [torch.bfloat16, torch.float32]
-            elif "decoder.middle.0.residual.0.gamma" in sd:
-                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
-                self.upscale_index_formula = (4, 8, 8)
-                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
-                self.downscale_index_formula = (4, 8, 8)
-                self.latent_dim = 3
-                self.latent_channels = 16
-                ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
-                self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig)
-                self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-                self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype)
-            elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd:
-                self.latent_dim = 1
-                ln_post = "geo_decoder.ln_post.weight" in sd
-                inner_size = sd["geo_decoder.output_proj.weight"].shape[1]
-                downsample_ratio = sd["post_kl.weight"].shape[0] // inner_size
-                mlp_expand = sd["geo_decoder.cross_attn_decoder.mlp.c_fc.weight"].shape[0] // inner_size
-                self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype)  # TODO
-                self.memory_used_decode = lambda shape, dtype: (1024 * 1024 * 1024 * 2.0) * model_management.dtype_size(dtype)  # TODO
-                ddconfig = {"embed_dim": 64, "num_freqs": 8, "include_pi": False, "heads": 16, "width": 1024, "num_decoder_layers": 16, "qkv_bias": False, "qk_norm": True, "geo_decoder_mlp_expand_ratio": mlp_expand, "geo_decoder_downsample_ratio": downsample_ratio, "geo_decoder_ln_post": ln_post}
-                self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE(**ddconfig)
-                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-            elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
-                self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100)
-                self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
-                self.latent_channels = 8
-                self.output_channels = 2
-                self.upscale_ratio = 4096
-                self.downscale_ratio = 4096
-                self.latent_dim = 2
-                self.process_output = lambda audio: audio
-                self.process_input = lambda audio: audio
-                self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-                self.disable_offload = True
-                self.extra_1d_channel = 16
            else:
                logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                self.first_stage_model = None
@@ -482,10 +419,6 @@ class VAE:
        self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
        logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))

-    def throw_exception_if_invalid(self):
-        if self.first_stage_model is None:
-            raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.")
-
    def vae_encode_crop_pixels(self, pixels):
        downscale_ratio = self.spacial_compression_encode()

@@ -512,13 +445,7 @@ class VAE:
        return output

    def decode_tiled_1d(self, samples, tile_x=128, overlap=32):
-        if samples.ndim == 3:
-            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
-        else:
-            og_shape = samples.shape
-            samples = samples.reshape((og_shape[0], og_shape[1] * og_shape[2], -1))
-            decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).float()
-
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))

    def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
@@ -538,49 +465,33 @@ class VAE:
        samples /= 3.0
        return samples

-    def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048):
-        if self.latent_dim == 1:
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
-            out_channels = self.latent_channels
-            upscale_amount = 1 / self.downscale_ratio
-        else:
-            extra_channel_size = self.extra_1d_channel
-            out_channels = self.latent_channels * extra_channel_size
-            tile_x = tile_x // extra_channel_size
-            overlap = overlap // extra_channel_size
-            upscale_amount = 1 / self.downscale_ratio
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).float()
-
-        out = comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=self.output_device)
-        if self.latent_dim == 1:
-            return out
-        else:
-            return out.reshape(samples.shape[0], self.latent_channels, extra_channel_size, -1)
+    def encode_tiled_1d(self, samples, tile_x=128 * 2048, overlap=32 * 2048):
+        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
+        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=(1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device)

    def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device)

-    def decode(self, samples_in, vae_options={}):
-        self.throw_exception_if_invalid()
+    def decode(self, samples_in):
        pixel_samples = None
        try:
            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
            batch_number = int(free_memory / memory_used)
            batch_number = max(1, batch_number)

            for x in range(0, samples_in.shape[0], batch_number):
                samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
-                out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).float())
+                out = self.process_output(self.first_stage_model.decode(samples).to(self.output_device).float())
                if pixel_samples is None:
                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
                pixel_samples[x:x+batch_number] = out
        except model_management.OOM_EXCEPTION:
            logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
            dims = samples_in.ndim - 2
-            if dims == 1 or self.extra_1d_channel is not None:
+            if dims == 1:
                pixel_samples = self.decode_tiled_1d(samples_in)
            elif dims == 2:
                pixel_samples = self.decode_tiled_(samples_in)
@@ -593,9 +504,8 @@ class VAE:
        return pixel_samples

    def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
-        self.throw_exception_if_invalid()
        memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) #TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used)
        dims = samples.ndim - 2
        args = {}
        if tile_x is not None:
@@ -622,14 +532,13 @@ class VAE:
        return output.movedim(1, -1)

    def encode(self, pixel_samples):
-        self.throw_exception_if_invalid()
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
        pixel_samples = pixel_samples.movedim(-1, 1)
        if self.latent_dim == 3 and pixel_samples.ndim < 5:
            pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
        try:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
-            model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
            batch_number = int(free_memory / max(1, memory_used))
            batch_number = max(1, batch_number)
@@ -647,7 +556,7 @@ class VAE:
                tile = 256
                overlap = tile // 4
                samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
-            elif self.latent_dim == 1 or self.extra_1d_channel is not None:
+            elif self.latent_dim == 1:
                samples = self.encode_tiled_1d(pixel_samples)
            else:
                samples = self.encode_tiled_(pixel_samples)
@@ -655,7 +564,6 @@ class VAE:
        return samples

    def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
-        self.throw_exception_if_invalid()
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
        dims = self.latent_dim
        pixel_samples = pixel_samples.movedim(-1, 1)
@@ -663,7 +571,7 @@ class VAE:
            pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)

        memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)  # TODO: calculate mem required for tile
-        model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+        model_management.load_models_gpu([self.patcher], memory_required=memory_used)

        args = {}
        if tile_x is not None:
@@ -749,11 +657,6 @@ class CLIPType(Enum):
    HUNYUAN_VIDEO = 9
    PIXART = 10
    COSMOS = 11
-    LUMINA2 = 12
-    WAN = 13
-    HIDREAM = 14
-    CHROMA = 15
-    ACE = 16


 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -772,7 +675,6 @@ class TEModel(Enum):
    T5_BASE = 6
    LLAMA3_8 = 7
    T5_XXL_OLD = 8
-    GEMMA_2_2B = 9

 def detect_te_model(sd):
    if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -791,8 +693,6 @@ def detect_te_model(sd):
        return TEModel.T5_XXL_OLD
    if "encoder.block.0.layer.0.SelfAttention.k.weight" in sd:
        return TEModel.T5_BASE
-    if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
-        return TEModel.GEMMA_2_2B
    if "model.layers.0.post_attention_layernorm.weight" in sd:
        return TEModel.LLAMA3_8
    return None
@@ -830,7 +730,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            if "text_projection" in clip_data[i]:
                clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node

-    tokenizer_data = {}
    clip_target = EmptyClass()
    clip_target.params = {}
    if len(clip_data) == 1:
@@ -842,9 +741,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            elif clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
-            elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
-                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
            else:
                clip_target.clip = sdxl_clip.SDXLRefinerClipModel
                clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@@ -858,17 +754,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            elif clip_type == CLIPType.LTXV:
                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA:
+            elif clip_type == CLIPType.PIXART:
                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
-            elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
-                clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
-                tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
-            elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
-                                                                        clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
-                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
            else: #CLIPType.MOCHI
                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
@@ -879,29 +767,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
            clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
        elif te_model == TEModel.T5_BASE:
-            if clip_type == CLIPType.ACE or "spiece_model" in clip_data[0]:
-                clip_target.clip = comfy.text_encoders.ace.AceT5Model
-                clip_target.tokenizer = comfy.text_encoders.ace.AceT5Tokenizer
-                tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
-            else:
-                clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
-                clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
-        elif te_model == TEModel.GEMMA_2_2B:
-            clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
-            clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
-            tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
-        elif te_model == TEModel.LLAMA3_8:
-            clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
-                                                                        clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
-            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
+            clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
+            clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
        else:
-            # clip_l
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
-            elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
-                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
            else:
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
@@ -919,35 +790,15 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif clip_type == CLIPType.HUNYUAN_VIDEO:
            clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer
-        elif clip_type == CLIPType.HIDREAM:
-            # Detect
-            hidream_dualclip_classes = []
-            for hidream_te in clip_data:
-                te_model = detect_te_model(hidream_te)
-                hidream_dualclip_classes.append(te_model)
-
-            clip_l = TEModel.CLIP_L in hidream_dualclip_classes
-            clip_g = TEModel.CLIP_G in hidream_dualclip_classes
-            t5 = TEModel.T5_XXL in hidream_dualclip_classes
-            llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
-
-            # Initialize t5xxl_detect and llama_detect kwargs if needed
-            t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
-            llama_kwargs = llama_detect(clip_data) if llama else {}
-
-            clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, **t5_kwargs, **llama_kwargs)
-            clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
    elif len(clip_data) == 3:
        clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(**t5xxl_detect(clip_data))
        clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
-    elif len(clip_data) == 4:
-        clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), **llama_detect(clip_data))
-        clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer

    parameters = 0
+    tokenizer_data = {}
    for c in clip_data:
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
@@ -994,13 +845,13 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
    return (model, clip, vae)

 def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
-    sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
-    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
+    sd = comfy.utils.load_torch_file(ckpt_path)
+    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
    if out is None:
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
    return out

-def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
+def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    clip = None
    clipvision = None
    vae = None
@@ -1012,24 +863,19 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()

-    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
+    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
    if model_config is None:
-        logging.warning("Warning, This is not a checkpoint file, trying to load it as a diffusion model only.")
-        diffusion_model = load_diffusion_model_state_dict(sd, model_options={})
-        if diffusion_model is None:
-            return None
-        return (diffusion_model, None, VAE(sd={}), None)  # The VAE object is there to throw an exception if it's actually used'
-
+        return None

    unet_weight_dtype = list(model_config.supported_inference_dtypes)
-    if model_config.scaled_fp8 is not None:
-        weight_dtype = None
+    if weight_dtype is not None and model_config.scaled_fp8 is None:
+        unet_weight_dtype.append(weight_dtype)

    model_config.custom_operations = model_options.get("custom_operations", None)
    unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))

    if unet_dtype is None:
-        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
+        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)

    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
@@ -1046,7 +892,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    if output_vae:
        vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
        vae_sd = model_config.process_vae_state_dict(vae_sd)
-        vae = VAE(sd=vae_sd, metadata=metadata)
+        vae = VAE(sd=vae_sd)

    if output_clip:
        clip_target = model_config.clip_target(state_dict=sd)
@@ -1120,11 +966,11 @@ def load_diffusion_model_state_dict(sd, model_options={}): #load unet in diffuse

    offload_device = model_management.unet_offload_device()
    unet_weight_dtype = list(model_config.supported_inference_dtypes)
-    if model_config.scaled_fp8 is not None:
-        weight_dtype = None
+    if weight_dtype is not None and model_config.scaled_fp8 is None:
+        unet_weight_dtype.append(weight_dtype)

    if dtype is None:
-        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
+        unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)
    else:
        unet_dtype = dtype

--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -82,8 +82,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    LAYERS = [
        "last",
        "pooled",
-        "hidden",
-        "all"
+        "hidden"
    ]
    def __init__(self, device="cpu", max_length=77,
                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
@@ -94,8 +93,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):

        if textmodel_json_config is None:
            textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")
-            if "model_name" not in model_options:
-                model_options = {**model_options, "model_name": "clip_l"}

        if isinstance(textmodel_json_config, dict):
            config = textmodel_json_config
@@ -103,10 +100,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
            with open(textmodel_json_config) as f:
                config = json.load(f)

-        te_model_options = model_options.get("{}_model_config".format(model_options.get("model_name", "")), {})
-        for k, v in te_model_options.items():
-            config[k] = v
-
        operations = model_options.get("custom_operations", None)
        scaled_fp8 = None

@@ -154,9 +147,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    def set_clip_options(self, options):
        layer_idx = options.get("layer", self.layer_idx)
        self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
-        if self.layer == "all":
-            pass
-        elif layer_idx is None or abs(layer_idx) > self.num_layers:
+        if layer_idx is None or abs(layer_idx) > self.num_layers:
            self.layer = "last"
        else:
            self.layer = "hidden"
@@ -167,98 +158,71 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.layer_idx = self.options_default[1]
        self.return_projected_pooled = self.options_default[2]

-    def process_tokens(self, tokens, device):
-        end_token = self.special_tokens.get("end", None)
-        if end_token is None:
-            cmp_token = self.special_tokens.get("pad", -1)
-        else:
-            cmp_token = end_token
-
-        embeds_out = []
-        attention_masks = []
-        num_tokens = []
+    def set_up_textual_embeddings(self, tokens, current_embeds):
+        out_tokens = []
+        next_new_token = token_dict_size = current_embeds.weight.shape[0]
+        embedding_weights = []

        for x in tokens:
-            attention_mask = []
            tokens_temp = []
-            other_embeds = []
-            eos = False
-            index = 0
            for y in x:
                if isinstance(y, numbers.Integral):
-                    if eos:
-                        attention_mask.append(0)
+                    tokens_temp += [int(y)]
+                else:
+                    if y.shape[0] == current_embeds.weight.shape[1]:
+                        embedding_weights += [y]
+                        tokens_temp += [next_new_token]
+                        next_new_token += 1
                    else:
-                        attention_mask.append(1)
-                    token = int(y)
-                    tokens_temp += [token]
-                    if not eos and token == cmp_token:
-                        if end_token is None:
-                            attention_mask[-1] = 0
-                        eos = True
-                else:
-                    other_embeds.append((index, y))
-                index += 1
+                        logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(y.shape[0], current_embeds.weight.shape[1]))
+            while len(tokens_temp) < len(x):
+                tokens_temp += [self.special_tokens["pad"]]
+            out_tokens += [tokens_temp]

-            tokens_embed = torch.tensor([tokens_temp], device=device, dtype=torch.long)
-            tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32)
-            index = 0
-            pad_extra = 0
-            for o in other_embeds:
-                emb = o[1]
-                if torch.is_tensor(emb):
-                    emb = {"type": "embedding", "data": emb}
+        n = token_dict_size
+        if len(embedding_weights) > 0:
+            new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
+            new_embedding.weight[:token_dict_size] = current_embeds.weight
+            for x in embedding_weights:
+                new_embedding.weight[n] = x
+                n += 1
+            self.transformer.set_input_embeddings(new_embedding)

-                emb_type = emb.get("type", None)
-                if emb_type == "embedding":
-                    emb = emb.get("data", None)
-                else:
-                    if hasattr(self.transformer, "preprocess_embed"):
-                        emb = self.transformer.preprocess_embed(emb, device=device)
-                    else:
-                        emb = None
+        processed_tokens = []
+        for x in out_tokens:
+            processed_tokens += [list(map(lambda a: n if a == -1 else a, x))] #The EOS token should always be the largest one

-                if emb is None:
-                    index += -1
-                    continue
-
-                ind = index + o[0]
-                emb = emb.view(1, -1, emb.shape[-1]).to(device=device, dtype=torch.float32)
-                emb_shape = emb.shape[1]
-                if emb.shape[-1] == tokens_embed.shape[-1]:
-                    tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1)
-                    attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:]
-                    index += emb_shape - 1
-                else:
-                    index += -1
-                    pad_extra += emb_shape
-                    logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(emb.shape[-1], tokens_embed.shape[-1]))
-
-            if pad_extra > 0:
-                padd_embed = self.transformer.get_input_embeddings()(torch.tensor([[self.special_tokens["pad"]] * pad_extra], device=device, dtype=torch.long), out_dtype=torch.float32)
-                tokens_embed = torch.cat([tokens_embed, padd_embed], dim=1)
-                attention_mask = attention_mask + [0] * pad_extra
-
-            embeds_out.append(tokens_embed)
-            attention_masks.append(attention_mask)
-            num_tokens.append(sum(attention_mask))
-
-        return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens
+        return processed_tokens

    def forward(self, tokens):
-        device = self.transformer.get_input_embeddings().weight.device
-        embeds, attention_mask, num_tokens = self.process_tokens(tokens, device)
+        backup_embeds = self.transformer.get_input_embeddings()
+        device = backup_embeds.weight.device
+        tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
+        tokens = torch.LongTensor(tokens).to(device)
+
+        attention_mask = None
+        if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks:
+            attention_mask = torch.zeros_like(tokens)
+            end_token = self.special_tokens.get("end", None)
+            if end_token is None:
+                cmp_token = self.special_tokens.get("pad", -1)
+            else:
+                cmp_token = end_token
+
+            for x in range(attention_mask.shape[0]):
+                for y in range(attention_mask.shape[1]):
+                    attention_mask[x, y] = 1
+                    if tokens[x, y] == cmp_token:
+                        if end_token is None:
+                            attention_mask[x, y] = 0
+                        break

        attention_mask_model = None
        if self.enable_attention_masks:
            attention_mask_model = attention_mask

-        if self.layer == "all":
-            intermediate_output = "all"
-        else:
-            intermediate_output = self.layer_idx
-
-        outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
+        outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
+        self.transformer.set_input_embeddings(backup_embeds)

        if self.layer == "last":
            z = outputs[0].float()
@@ -457,14 +421,13 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
    return embed_out

 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, tokenizer_data={}):
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
-        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
-        self.max_length = tokenizer_data.get("{}_max_length".format(embedding_key), max_length)
+        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
+        self.max_length = max_length
        self.min_length = min_length
        self.end_token = None
-        self.min_padding = min_padding

        empty = self.tokenizer('')["input_ids"]
        self.tokenizer_adds_end_token = has_end_token
@@ -519,15 +482,13 @@ class SDTokenizer:
        return (embed, leftover)


-    def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        '''
        Takes a prompt and converts it to a list of (token, weight, word id) elements.
        Tokens can both be integer tokens and pre computed CLIP tensors.
        Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens.
        Returned list has the dimensions NxM where M is the input size of CLIP
        '''
-        min_length = tokenizer_options.get("{}_min_length".format(self.embedding_key), self.min_length)
-        min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding)

        text = escape_important(text)
        parsed_weights = token_weights(text, 1.0)
@@ -606,12 +567,10 @@ class SDTokenizer:
        #fill last batch
        if self.end_token is not None:
            batch.append((self.end_token, 1.0, 0))
-        if min_padding is not None:
-            batch.extend([(self.pad_token, 1.0, 0)] * min_padding)
-        if self.pad_to_max_length and len(batch) < self.max_length:
+        if self.pad_to_max_length:
            batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
-        if min_length is not None and len(batch) < min_length:
-            batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch)))
+        if self.min_length is not None and len(batch) < self.min_length:
+            batch.extend([(self.pad_token, 1.0, 0)] * (self.min_length - len(batch)))

        if not return_word_ids:
            batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
@@ -626,27 +585,22 @@ class SDTokenizer:
        return {}

 class SD1Tokenizer:
-    def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer, name=None):
-        if name is not None:
-            self.clip_name = name
-            self.clip = "{}".format(self.clip_name)
-        else:
-            self.clip_name = clip_name
-            self.clip = "clip_{}".format(self.clip_name)
-
+    def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer):
+        self.clip_name = clip_name
+        self.clip = "clip_{}".format(self.clip_name)
        tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))

-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
-        out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids, **kwargs)
+        out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids)
        return out

    def untokenize(self, token_weight_pair):
        return getattr(self, self.clip).untokenize(token_weight_pair)

    def state_dict(self):
-        return getattr(self, self.clip).state_dict()
+        return {}

 class SD1CheckpointClipModel(SDClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
@@ -664,7 +618,6 @@ class SD1ClipModel(torch.nn.Module):
            self.clip = "clip_{}".format(self.clip_name)

        clip_model = model_options.get("{}_class".format(self.clip), clip_model)
-        model_options = {**model_options, "model_name": self.clip}
        setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs))

        self.dtypes = set()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
pythongosssss	fde9fdddff	Allow running with non working	2025-03-28 11:46:05 +08:00
pythongosssss	7bf381bc9e	Add model management and database - use sqlalchemy + alembic + sqlite for db - extract model data and previews - endpoints for db interactions - add tests	2025-03-28 11:39:56 +08:00