* Added initial Flux.1 Kontext Pro Image node - recreated branch to save my sanity from rebase crap after master got rebased
* Add safety filter to Kontext.
* Make safety = 2 and make the input image optional.
* Add BFL kontext API nodes.
---------
Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
* Make torch compile node use wrapper instead of object_patch for the entire diffusion_models object, allowing key associations on diffusion_models to not break (loras, getting attributes, etc.)
* Moved torch compile code into comfy_api so it can be used by custom nodes with a degree of confidence
* Refactor set_torch_compile_wrapper to support a list of keys instead of just diffusion_model, as well as additional torch.compile args
* remove unused import
* Moved torch compile kwargs to be stored in model_options instead of attachments; attachments are more intended for things to be 'persisted', AKA not deepcopied
* Add some comments
* Remove random line of code, not sure how it got there
* support wan camera models
* fix issues flagged by ruff check
* change camera_condition type; make camera_condition optional
* support camera trajectory nodes
* fix camera direction
---------
Co-authored-by: Qirui Sun <sunqr0667@126.com>
* [Luma] Print download URL of successful task result directly on nodes (#177)
[Veo] Print download URL of successful task result directly on nodes (#184)
[Recraft] Print download URL of successful task result directly on nodes (#183)
[Pixverse] Print download URL of successful task result directly on nodes (#182)
[Kling] Print download URL of successful task result directly on nodes (#181)
[MiniMax] Print progress text and download URL of successful task result directly on nodes (#179)
[Docs] Link to docs in `API_NODE` class property type annotation comment (#178)
[Ideogram] Print download URL of successful task result directly on nodes (#176)
Show output URL and progress text on Pika nodes (#168)
[BFL] Print download URL of successful task result directly on nodes (#175)
[OpenAI] Print download URL of successful task result directly on nodes (#174)
* fix ruff errors
* fix 3.10 syntax error
* first pass at opus and mp3 as well as migrating flac to pyav
* minor mp3 encoding fix
* fix ruff
* delete dead code
* split out save audio to separate nodes per filetype
* fix ruff
* Handle Comfy API key based authorization (#167)
Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
* Bump frontend version to include API key features (#170)
* bump templates version
---------
Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
* Add Ideogram generate node.
* Add staging api.
* Add API_NODE and common error for missing auth token (#5)
* Add Minimax Video Generation + Async Task queue polling example (#6)
* [Minimax] Show video preview and embed workflow in output (#7)
* Remove uv.lock
* Remove polling operations.
* Revert "Remove polling operations."
* Update stubs.
* Added Ideogram and Minimax back in.
* Added initial BFL Flux 1.1 [pro] Ultra node (#11)
* Add --comfy-api-base launch arg (#13)
* Add instructions for staging development. (#14)
* remove validation to make it easier to run against LAN copies of the API
* Manually add BFL polling status response schema (#15)
* Add function for uploading files. (#18)
* Add Luma nodes (#16)
* Refactor util functions (#20)
* Add VIDEO type (#21)
* Add rest of Luma node functionality (#19)
* Fix image_luma_ref not working (#28)
* [Bug] Remove duplicated option T2V-01 in MinimaxTextToVideoNode (#31)
* Add utils to map from pydantic model fields to comfy node inputs (#30)
* add veo2, bump av req (#32)
* Add Recraft nodes (#29)
* Add Kling Nodes (#12)
* Add Camera Concepts (luma_concepts) to Luma Video nodes (#33)
* Add Runway nodes (#17)
* Convert Minimax node to use VIDEO output type (#34)
* Standard `CATEGORY` system for api nodes (#35)
* Set `Content-Type` header when uploading files (#36)
* add better error propagation to veo2 (#37)
* Add Realistic Image and Logo Raster styles for Recraft v3 (#38)
* Fix runway image upload and progress polling (#39)
* Fix image upload for Luma: only include `Content-Type` header field if it's set explicitly (#40)
* Moved Luma nodes to nodes_luma.py (#47)
* Moved Recraft nodes to nodes_recraft.py (#48)
* Add Pixverse nodes (#46)
* Move and fix BFL nodes to node_bfl.py (#49)
* Move and edit Minimax node to nodes_minimax.py (#50)
* Add Minimax Image to Video node + Cleanup (#51)
* Add Recraft Text to Vector node, add Save SVG node to handle its output (#53)
* Added pixverse_template support to Pixverse Text to Video node (#54)
* Added Recraft Controls + Recraft Color RGB nodes (#57)
* split remaining nodes out of nodes_api, make utility lib, refactor ideogram (#61)
* Add types and docstrings to utils file (#64)
* Fix: `PollingOperation` progress bar update progress by absolute value (#65)
* Use common download function in kling nodes module (#67)
* Fix: Luma video nodes in `api nodes/image` category (#68)
* Set request type explicitly (#66)
* Add `control_after_generate` to all seed inputs (#69)
* Fix bug: deleting `Content-Type` when property does not exist (#73)
* Add preview to Save SVG node (#74)
* change default poll interval (#76), rework veo2
* Add Pixverse and updated Kling types (#75)
* Added Pixverse Image to Video node (#77)
* Add Pixverse Transition Video node (#79)
* Proper ray-1-6 support as fix has been applied in backend (#80)
* Added Recraft Style - Infinite Style Library node (#82)
* add ideogram v3 (#83)
* [Kling] Split Camera Control config to its own node (#81)
* Add Pika i2v and t2v nodes (#52)
* Temporary Fix for Runway (#87)
* Added Stability Stable Image Ultra node (#86)
* Remove Runway nodes (#88)
* Fix: Prompt text can't be validated in Kling nodes when using primitive nodes (#90)
* Fix: typo in node name "Stabiliy" => "Stability" (#91)
* Add String (Multiline) node (#93)
* Update Pika Duration and Resolution options (#94)
* Change base branch to master. Not main. (#95)
* Fix UploadRequest file_name param (#98)
* Removed Infinite Style Library until later (#99)
* fix ideogram style types (#100)
* fix multi image return (#101)
* add metadata saving to SVG (#102)
* Bump templates version to include API node template workflows (#104)
* Fix: `download_url_to_video_output` return type (#103)
* fix 4o generation bug (#106)
* Serve SVG files directly (#107)
* Add a bunch of nodes, 3 ready to use, the rest waiting for endpoint support (#108)
* Revert "Serve SVG files directly" (#111)
* Expose 4 remaining Recraft nodes (#112)
* [Kling] Add `Duration` and `Video ID` outputs (#105)
* Fix: datamodel-codegen sets string#binary type to non-existent `bytes_aliased` variable (#114)
* Fix: Dall-e 2 not setting request content-type dynamically (#113)
* Default request timeout: one hour. (#116)
* Add Kling nodes: camera control, start-end frame, lip-sync, video extend (#115)
* Add 8 nodes - 4 BFL, 4 Stability (#117)
* Fix Recraft ImageToImage error for nonexistent random_seed param (#118)
* Add remaining Pika nodes (#119)
* Make controls input work for Recraft Image to Image node (#120)
* Use upstream PR: Support saving Comfy VIDEO type to buffer (#123)
* Use Upstream PR: "Fix: Error creating video when sliced audio tensor chunks are non-c-contiguous" (#127)
* Improve audio upload utils (#128)
* Fix: Nested `AnyUrl` in request model cannot be serialized (Kling, Runway) (#129)
* Show errors and API output URLs to the user (change log levels) (#131)
* Fix: Luma I2I fails when weight is <=0.01 (#132)
* Change category of `LumaConcepts` node from image to video (#133)
* Fix: `image.shape` accessed before `image` is null-checked (#134)
* Apply small fixes and most prompt validation (if needed to avoid API error) (#135)
* Node name/category modifications (#140)
* Add back Recraft Style - Infinite Style Library node (#141)
* Fixed Kling: Check attributes of pydantic types. (#144)
* Bump `comfyui-workflow-templates` version (#142)
* [Kling] Print response data when error validating response (#146)
* Fix: error validating Kling image response, trying to use `"key" in` on Pydantic class instance (#147)
* [Kling] Fix: Correct/verify supported subset of input combos in Kling nodes (#149)
* [Kling] Fix typo in node description (#150)
* [Kling] Fix: CFG min/max not being enforced (#151)
* Rebase launch-rebase (private) on prep-branch (public copy of master) (#153)
* Bump templates version (#154)
* Fix: Kling image gen nodes don't return entire batch when `n` > 1 (#152)
* Remove pixverse_template from PixVerse Transition Video node (#155)
* Invert image_weight value on Luma Image to Image node (#156)
* Invert and resize mask for Ideogram V3 node to match masking conventions (#158)
* [Kling] Fix: image generation nodes not returning Tuple (#159)
* [Bug] [Kling] Fix Kling camera control (#161)
* Kling Image Gen v2 + improve node descriptions for Flux/OpenAI (#160)
* [Kling] Don't return video_id from dual effect video (#162)
* Bump frontend to 1.18.8 (#163)
* Use 3.9 compat syntax (#164)
* Use Python 3.10
* add example env var
* Update templates to 0.1.11
* Bump frontend to 1.18.9
---------
Co-authored-by: Robin Huang <robin.j.huang@gmail.com>
Co-authored-by: Christian Byrne <cbyrne@comfy.org>
Co-authored-by: thot experiment <94414189+thot-experiment@users.noreply.github.com>
* Upload files for Chroma Implementation
* Remove trailing whitespace
* trim more trailing whitespace..oops
* remove unused imports
* Add supported_inference_dtypes
* Set min_length to 0 and remove attention_mask=True
* Set min_length to 1
* get_modulations added from blepping and minor changes
* Add lora conversion if statement in lora.py
* Update supported_models.py
* update model_base.py
* add upstream commits
* set modelType.FLOW, will cause beta scheduler to work properly
* Adjust memory usage factor and remove unnecessary code
* fix mistake
* reduce code duplication
* remove unused imports
* refactor for upstream sync
* sync chroma-support with upstream via syncbranch patch
* Update sd.py
* Add Chroma as option for the OptimalStepsScheduler node
* Add basic support for videos as types
This PR adds support for VIDEO as a first-class type. In order to avoid
unnecessary costs, VIDEO outputs must implement the `VideoInput` ABC,
but their implementation details can vary. Included are two
implementations of this type which can be returned by other nodes:
* `VideoFromFile` - Created with either a path on disk (as a string) or
a `io.BytesIO` containing the contents of a file in a supported format
(like .mp4). This implementation won't actually load the video unless
necessary. It will also avoid re-encoding when saving if possible.
* `VideoFromComponents` - Created from an image tensor and an optional
audio tensor.
Currently, only h264 encoded videos in .mp4 containers are supported for
saving, but the plan is to add additional encodings/containers in the
near future (particularly .webm).
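A minimal sketch of the shape such an ABC could take (method names here are illustrative assumptions, not the actual comfy_api surface):
```
from abc import ABC, abstractmethod

class VideoInput(ABC):
    """Sketch of a first-class VIDEO value; implementations may vary."""

    @abstractmethod
    def get_components(self):
        """Decode and return (image tensor, optional audio, frame rate)."""

    @abstractmethod
    def save_to(self, path: str):
        """Write the video to `path`, avoiding re-encoding when possible."""

class VideoFromFile(VideoInput):
    """Lazy implementation: nothing is decoded until a consumer asks."""

    def __init__(self, file):
        self.__file = file  # str path on disk or io.BytesIO of e.g. an .mp4

    def get_components(self):
        ...  # open with PyAV and decode only when actually needed

    def save_to(self, path: str):
        ...  # stream-copy the original container when no re-encode is needed
```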
* Add optimization to avoid parsing entire video
* Improve type declarations to reduce warnings
* Make sure bytesIO objects can be read many times
* Fix a potential issue when saving long videos
* Fix incorrect type annotation
* Add a `LoadVideo` node to make testing easier
* Refactor new types out of the base comfy folder
I've created a new `comfy_api` top-level module. The intention is that
anything within this folder would be covered by semver-style versioning
that would allow custom nodes to rely on them not introducing breaking
changes.
* Fix linting issue
This should speed up the lowvram mode a bit. It is currently only enabled when --async-offload is used, but it will be enabled by default in the future if there are no problems.
* Add Ideogram generate node.
* Add staging api.
* COMFY_API_NODE_NAME node property
* switch to boolean flag and use original node name for id
* add optional to type
* Add API_NODE and common error for missing auth token (#5)
* Add Minimax Video Generation + Async Task queue polling example (#6)
* [Minimax] Show video preview and embed workflow in output (#7)
* [API Nodes] Send empty request body instead of empty dictionary. (#8)
* Fixed: removed function from rebase.
* Add pydantic.
* Remove uv.lock
* Remove polling operations.
* Update stubs workflow.
* Remove polling comments.
* Update stubs.
* Use pydantic v2.
* Add basic OpenAITextToImage node
* Add.
* convert image to tensor.
* Improve types.
* Ruff.
* Push tests.
* Handle multi-form data.
- Don't set content-type for multipart/form-data
- Use data field instead of JSON
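For illustration, the usual pattern with the `requests` library (the endpoint and field names here are hypothetical): pass parts via `files`/`data` and let the library generate the multipart boundary.
```
import requests

with open("image.png", "rb") as f:
    files = {"file": ("image.png", f, "image/png")}
    # Do NOT set the Content-Type header yourself; the library must
    # generate the multipart boundary. Extra fields go in `data`, not JSON.
    resp = requests.post("https://api.comfy.org/upload", files=files,
                         data={"kind": "input"})
```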
* Change to api.comfy.org
* Handle error code 409.
* Remove nodes.
---------
Co-authored-by: bymyself <cbyrne@comfy.org>
Co-authored-by: Yoland Y <4950057+yoland68@users.noreply.github.com>
* install templates as pip package
* Update requirements.txt
* bump templates version to include hidream
---------
Co-authored-by: Chenlei Hu <hcl@comfy.org>
* support 3d model filtering
* fix lint error: blank line contains whitespace
* add model extensions to test runner mimetype cache manually
* use unittest.mock.patch
* remove mtl file from testcase (actually plaintext support file)
* add dependency-aware cache that removes a cached node as soon as all of its descendants have executed. This allows users with lower RAM to run workflows they would otherwise not be able to run. The downside is that every workflow will fully run each time even if no nodes have changed.
* remove test code
* tidy code
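A minimal sketch of the idea, assuming a graph given as node -> direct consumers (the real cache also has to handle subgraph expansion and deeper descendants):
```
class DependencyAwareCache:
    """Free a node's cached output once everything that needs it has run."""

    def __init__(self, consumers):
        # consumers: node_id -> list of node_ids that read this node's output
        self.consumers = consumers
        self.pending = {n: len(c) for n, c in consumers.items()}
        self.outputs = {}

    def store(self, node_id, value):
        self.outputs[node_id] = value

    def node_executed(self, node_id, input_ids):
        # each executed node releases one reference on every input it consumed
        for parent in input_ids:
            self.pending[parent] -= 1
            if self.pending[parent] == 0:
                self.outputs.pop(parent, None)  # all consumers done: evict
```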
* Ensuring a 401 error is returned when user data is not found in multi-user context.
* Returning a 401 error when the provided comfy-user does not exist on the server side.
* draft pass at a native comfy implementation of Lotus-D depth and normal est
* fix model_sampling kludges
* fix ruff
---------
Co-authored-by: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
This commit relaxes the divisibility constraint for single-frame
conditionings. For single frames, the index can be arbitrary, while
multi-frame conditionings (>= 9 frames) must still be aligned to 8
frames.
Co-authored-by: Andrew Kvochko <a.kvochko@lightricks.com>
* Better argument handling of front-end-root
Improves handling of the front-end-root launch argument. There have been several instances where users set it, ComfyUI launched as normal, and the launch arg was completely disregarded, which doesn't make sense. Better to indicate to the user that something is incorrect.
* Removed unused import
There was no real reason to use "Optional" typing in the front-end-root argument.
This patch fixes a bug in LTXVCropGuides when the latent has no
keyframes. Additionally, the first frame is always added as a keyframe.
Co-authored-by: Andrew Kvochko <a.kvochko@lightricks.com>
* improved: better installation guide
- change `pip` to `{sys.executable} -m pip`
* modified: to prevent the guide message from being obscured by a complex error message, use `exit` instead of `raise`.
* ruff fix
The idea is that you can indicate how much quality vs speed you want.
At the moment:
--fast 2 enables fp16 accumulation if your pytorch supports it.
--fast 5 enables fp8 matrix mult on fp8 models and the optimization above.
--fast without a number enables all optimizations.
* Fix link pointing to non-existing docs
The current link is pointing to a path that does not exist any longer.
I changed it to point to the correct path for custom node datatypes.
* Update node_typing.py
The frontend part isn't done yet so there is no video preview on the node
or dragging the webm on the interface to load the workflow yet.
This uses a new dependency: PyAV.
* add LoadImageOutput node
* add route for input/output/temp files
* update node_typing.py
* use literal type for image_folder field
* mark node as beta
I'm not sure which arches are supported yet. If you see improvements in
memory usage while using --use-pytorch-cross-attention on your AMD GPU let
me know and I will add it to the list.
* Fix for running via DirectML
Fix DirectML empty image generation issue with Flux1. Add a CPU fallback for the unsupported path. Verified the model works on AMD GPUs
* fix formatting
* update causal mask calculation
* Use `torch.special.expm1`
This function provides greater precision than `exp(x) - 1` for small values of `x`.
Found with TorchFix https://github.com/pytorch-labs/torchfix/
* Use non-alias
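A quick illustration of the precision difference in float32:
```
import torch

x = torch.tensor(1e-8, dtype=torch.float32)
naive = torch.exp(x) - 1            # 0.0 -- the 1 + 1e-8 rounds away in fp32
precise = torch.special.expm1(x)    # ~1.0000e-08
```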
* Add 'sigmas' to transformer_options so that downstream code can know about the full scope of current sampling run, fix Hook Keyframes' guarantee_steps=1 inconsistent behavior with sampling split across different Sampling nodes/sampling runs by referencing 'sigmas'
* Cleaned up hooks.py, refactored Hook.should_register and add_hook_patches to use target_dict instead of target so that more information can be provided about the current execution environment if needed
* Refactor WrapperHook into TransformerOptionsHook, as there is no need to separate out Wrappers/Callbacks/Patches into different hook types (all affect transformer_options)
* Refactored HookGroup to also store a dictionary of hooks separated by hook_type, modified necessary code to no longer need to manually separate out hooks by hook_type
* In inner_sample, change "sigmas" to "sampler_sigmas" in transformer_options to not conflict with the "sigmas" that will overwrite "sigmas" in _calc_cond_batch
* Refactored 'registered' to be HookGroup instead of a list of Hooks, made AddModelsHook operational and compliant with should_register result, moved TransformerOptionsHook handling out of ModelPatcher.register_all_hook_patches, support patches in TransformerOptionsHook properly by casting any patches/wrappers/hooks to proper device at sample time
* Made hook clone code sane, made clear ObjectPatchHook and SetInjectionsHook are not yet operational
* Fix performance of hooks when hooks are appended via Cond Pair Set Props nodes by properly caching between positive and negative conds, make hook_patches_backup behave as intended (in the case that something pre-registers WeightHooks on the ModelPatcher instead of registering it at sample time)
* Filter only registered hooks on self.conds in CFGGuider.sample
* Make hook_scope functional for TransformerOptionsHook
* removed 4 whitespace lines to satisfy Ruff
* Add a get_injections function to ModelPatcher
* Made TransformerOptionsHook contribute to registered hooks properly, added some doc strings and removed a so-far unused variable
* Rename AddModelsHooks to AdditionalModelsHook, rename SetInjectionsHook to InjectionsHook (not yet implemented, but at least getting the naming figured out)
* Clean up a typehint
I think the issue this was working around has been solved.
If you notice that this change slows things down or causes stutters on
your AMD GPU with ROCm on Linux please report it.
This commit fixes the temporal tile size calculation, and removes
a redundant tile at the end of the range when its elements are
completely covered by the previous tile.
Co-authored-by: Andrew Kvochko <a.kvochko@lightricks.com>
* nit
* Add option to log non-error output to stdout
- No change to default behaviour
- Adds CLI argument: --log-stdout
- With this arg present, any logging of a level below logging.ERROR will be sent to stdout instead of stderr
* Add oneAPI device selector and some other minor changes.
* Fix device selector variable name.
* Flip minor version check sign.
* Undo changes to README.md.
This should make it possible to do higher res images/longer videos by
further offloading weights to CPU memory.
Please report an issue if this slows down things on your system.
The 10 step minimum for the AYS scheduler is pointless, it works well at lower steps, like 8 steps, or even 4 steps.
For example with LCM or DMD2.
Example here: https://i.ibb.co/56CSPMj/image.png
* fix attention OOM in xformers
* allow passing attention mask in flux attention
* allow an attn_mask in flux
* attn masks can be done using replace patches instead of a separate dict
* fix return types
* fix return order
* enumerate
* patch the right keys
* arg names
* fix a silly bug
* fix xformers masks
* replace match with if, elif, else
* mask with image_ref_size
* remove unused import
* remove unused import 2
* fix pytorch/xformers attention
This corrects a weird inconsistency with skip_reshape.
It also allows masks of various shapes to be passed, which will be
automatically expanded (in a memory-efficient way) to a size that is
compatible with xformers or pytorch sdpa respectively.
* fix mask shapes
The last ROCm 6.2 build was November 22nd; after that date, new builds use ROCm 6.2.4.
The builds from the new URL have been tested and work without problems.
* fix: custom nodes installed in the paths specified in `extra_model_paths.yaml` hit a bug where the prestartup script is not imported.
* Ensure custom paths are used during startup
https://github.com/comfyanonymous/ComfyUI/pull/5794
* Add MaHiRo (improved CFG)
long explanation of what it is is [here](https://huggingface.co/spaces/yoinked/blue-arxiv) (2024-1208.1)
note: if the node name has encoding issues (utf 8/whatever), I'd suggest replacing the face at the end with `(>w<)`
* add it to nodes.py, add description, and make it a post_cfg function
* fix
* revert the sampler_cfg_function thing
* switch cfg to args["denoised"]
- Commented out Windows OS from the CI matrix in test-ci.yml.
- Removed the test-win-nightly job to streamline testing on macOS and Linux only.
- Adjusted the matrix strategy to focus on Python versions and CUDA compatibility without Windows support.
* Reapply "Add union link connection type support (#5806)" (#5889)
This reverts commit bf9a90a145.
* Fix union type breaks existing type workarounds
* Add non-string test
* Add tests for hacks and non-string types
* Support python versions lower than 3.11
Now, if code messes up and keeps references to a model object when it should not, the only symptom will be endless prints in the log instead of the next workflow crashing ComfyUI.
* Added hook_patches to ModelPatcher for weights (model)
* Initial changes to calc_cond_batch to eventually support hook_patches
* Added current_patcher property to BaseModel
* Consolidated add_hook_patches_as_diffs into add_hook_patches func, fixed fp8 support for model-as-lora feature
* Added call to initialize_timesteps on hooks in process_conds func, and added call prepare current keyframe on hooks in calc_cond_batch
* Added default_conds support in calc_cond_batch func
* Added initial set of hook-related nodes, added code to register hooks for loras/model-as-loras, small renaming/refactoring
* Made CLIP work with hook patches
* Added initial hook scheduling nodes, small renaming/refactoring
* Fixed MaxSpeed and default conds implementations
* Added support for adding weight hooks that aren't registered on the ModelPatcher at sampling time
* Made Set Clip Hooks node work with hooks from Create Hook nodes, began work on better Create Hook Model As LoRA node
* Initial work on adding 'model_as_lora' lora type to calculate_weight
* Continued work on simpler Create Hook Model As LoRA node, started to implement ModelPatcher callbacks, attachments, and additional_models
* Fix incorrect ref to create_hook_patches_clone after moving function
* Added injections support to ModelPatcher + necessary bookkeeping, added additional_models support in ModelPatcher, conds, and hooks
* Added wrappers to ModelPatcher to facilitate standardized function wrapping
* Started scaffolding for other hook types, refactored get_hooks_from_cond to organize hooks by type
* Fix skip_until_exit logic bug breaking injection after first run of model
* Updated clone_has_same_weights function to account for new ModelPatcher properties, improved AutoPatcherEjector usage in partially_load
* Added WrapperExecutor for non-classbound functions, added calc_cond_batch wrappers
* Refactored callbacks+wrappers to allow storing lists by id
* Added forward_timestep_embed_patch type, added helper functions on ModelPatcher for emb_patch and forward_timestep_embed_patch, added helper functions for removing callbacks/wrappers/additional_models by key, added custom_should_register prop to hooks
* Added get_attachment func on ModelPatcher
* Implement basic MemoryCounter system for determining whether cached weights due to hooks should be offloaded in hooks_backup
* Modified ControlNet/T2IAdapter get_control function to receive transformer_options as additional parameter, made the model_options stored in extra_args in inner_sample be a clone of the original model_options instead of same ref
* Added create_model_options_clone func, modified type annotations to use __future__ so that I can use the better type annotations
* Refactored WrapperExecutor code to remove need for WrapperClassExecutor (now gone), added sampler.sample wrapper (pending review, will likely keep but will see what hacks this could currently let me get rid of in ACN/ADE)
* Added Combine versions of Cond/Cond Pair Set Props nodes, renamed Pair Cond to Cond Pair, fixed default conds never applying hooks (due to hooks key typo)
* Renamed Create Hook Model As LoRA nodes to make the test node the main one (more changes pending)
* Added uuid to conds in CFGGuider and uuids to transformer_options to allow uniquely identifying conds in batches during sampling
* Fixed models not being unloaded properly due to current_patcher reference; the current ComfyUI model cleanup code requires that nothing else has a reference to the ModelPatcher instances
* Fixed default conds not respecting hook keyframes, made keyframes not reset cache when strength is unchanged, fixed Cond Set Default Combine throwing error, fixed model-as-lora throwing error during calculate_weight after a recent ComfyUI update, small refactoring/scaffolding changes for hooks
* Changed CreateHookModelAsLoraTest to be the new CreateHookModelAsLora, rename old ones as 'direct' and will be removed prior to merge
* Added initial support within CLIP Text Encode (Prompt) node for scheduling weight hook CLIP strength via clip_start_percent/clip_end_percent on conds, added schedule_clip toggle to Set CLIP Hooks node, small cleanup/fixes
* Fix range check in get_hooks_for_clip_schedule so that proper keyframes get assigned to corresponding ranges
* Optimized CLIP hook scheduling to treat same strength as same keyframe
* Less fragile memory management.
* Make encode_from_tokens_scheduled call cleaner, rollback change in model_patcher.py for hook_patches_backup dict
* Fix issue.
* Remove useless function.
* Prevent and detect some types of memory leaks.
* Run garbage collector when switching workflow if needed.
* Moved WrappersMP/CallbacksMP/WrapperExecutor to patcher_extension.py
* Refactored code to store wrappers and callbacks in transformer_options, added apply_model and diffusion_model.forward wrappers
* Fix issue.
* Refactored hooks in calc_cond_batch to be part of get_area_and_mult tuple, added extra_hooks to ControlBase to allow custom controlnets w/ hooks, small cleanup and renaming
* Fixed inconsistency of results when schedule_clip is set to False, small renaming/typo fixing, added initial support for ControlNet extra_hooks to work in tandem with normal cond hooks, initial work on calc_cond_batch merging all subdicts in returned transformer_options
* Modified callbacks and wrappers so that unregistered types can be used, allowing custom_nodes to have their own unique callbacks/wrappers if desired
* Updated different hook types to reflect actual progress of implementation, initial scaffolding for working WrapperHook functionality
* Fixed existing weight hook_patches (pre-registered) not working properly for CLIP
* Removed Register/Direct hook nodes since they were present only for testing, removed diff-related weight hook calculation as improved_memory removes unload_model_clones and using sample time registered hooks is less hacky
* Added clip scheduling support to all other native ComfyUI text encoding nodes (sdxl, flux, hunyuan, sd3)
* Made WrapperHook functional, added another wrapper/callback getter, added ON_DETACH callback to ModelPatcher
* Made opt_hooks append by default instead of replace, renamed comfy.hooks set functions to be more accurate
* Added apply_to_conds to Set CLIP Hooks, modified relevant code to allow text encoding to automatically apply hooks to output conds when apply_to_conds is set to True
* Fix cached_hook_patches not respecting target_device/memory_counter results
* Fixed issue with setting weights from hooks instead of copying them, added additional memory_counter check when caching hook patches
* Remove unnecessary torch.no_grad calls for hook patches
* Increased MemoryCounter minimum memory to leave free by *2 until a better way to get inference memory estimate of currently loaded models exists
* For encode_from_tokens_scheduled, allow start_percent and end_percent in add_dict to limit which scheduled conds get encoded for optimization purposes
* Removed a .to call on results of calculate_weight in patch_hook_weight_to_device that was screwing up the intermediate results for fp8 prior to being passed into stochastic_rounding call
* Made encode_from_tokens_scheduled work when no hooks are set on patcher
* Small cleanup of comments
* Turn off hook patch caching when only 1 hook present in sampling, replace some current_hook = None with calls to self.patch_hooks(None) instead to avoid a potential edge case
* On Cond/Cond Pair nodes, removed opt_ prefix from optional inputs
* Allow both FLOATS and FLOAT for floats_strength input
* Revert change, does not work
* Made patch_hook_weight_to_device respect set_func and convert_func
* Make discard_model_sampling True by default
* Add changes manually from 'master' so merge conflict resolution goes more smoothly
* Cleaned up text encode nodes with just a single clip.encode_from_tokens_scheduled call
* Make sure encode_from_tokens_scheduled will respect use_clip_schedule on clip
* Made nodes in nodes_hooks be marked as experimental (beta)
* Add get_nested_additional_models for cases where additional_models could have their own additional_models, and add robustness for circular additional_models references
* Made finalize_default_conds area math consistent with other sampling code
* Changed 'opt_hooks' input of Cond/Cond Pair Set Default Combine nodes to 'hooks'
* Remove a couple old TODO's and a no longer necessary workaround
Add a way to reshape lora weights.
Allow weight patches on all weights, not just .weight and .bias
Add a way for a lora to set a weight to a specific value.
This one should work for skipping the single layers of models like Flux
and Auraflow.
If you want to see how these models work and how many double/single layers
they have see the "ModelMerge*" nodes for the specific model.
* fix --cuda-device arg for AMD/HIP devices
CUDA_VISIBLE_DEVICES is ignored for HIP devices/backend. Instead it uses HIP_VISIBLE_DEVICES. Setting this environment variable has no side effect for CUDA/NVIDIA so it can safely be set in any case and vice versa.
* deleted accidental if
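A sketch of what the fix amounts to (the helper name is illustrative):
```
import os

def set_visible_device(device_id: int):
    # HIP ignores CUDA_VISIBLE_DEVICES and CUDA ignores HIP_VISIBLE_DEVICES,
    # so setting both is side-effect free and covers either backend.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    os.environ["HIP_VISIBLE_DEVICES"] = str(device_id)
```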
* Add /logs/raw and /logs/subscribe for getting logs on frontend
Hijacks stderr/stdout to send all output data to the client on flush
* Use existing send sync method
* Fix get_logs should return string
* Fix bug
* pass no server
* fix tests
* Fix output flush on linux
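Roughly, the hijack works like the following simplified sketch (the real code also has to proxy stream attributes such as isatty and encoding; names here are assumptions):
```
import sys
from collections import deque

LOGS = deque(maxlen=300)   # history served by /logs/raw
SUBSCRIBERS = []           # websocket callbacks for /logs/subscribe

class LogInterceptor:
    def __init__(self, stream):
        self._stream = stream
        self._buffer = ""

    def write(self, data):
        self._buffer += data
        return self._stream.write(data)

    def flush(self):
        self._stream.flush()
        if self._buffer:
            entry, self._buffer = self._buffer, ""
            LOGS.append(entry)
            for send in SUBSCRIBERS:
                send(entry)  # push to connected clients on flush

sys.stdout = LogInterceptor(sys.stdout)
sys.stderr = LogInterceptor(sys.stderr)
```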
* Update nodes_images.py
Nodes menu has inconsistency in names, some with spaces between words, others not.
* Update nodes.py
Include the node mapping name line for Image Crop Node
* Update nodes_images.py
* Rename image nodes
add space between words for consistency > Display name mappings
To use:
"Load CLIP" node with t5xxl + type mochi
"Load Diffusion Model" node with the mochi dit file.
"Load VAE" with the mochi vae file.
EmptyMochiLatentVideo node for the latent.
euler + linear_quadratic in the KSampler node.
* Frontend Manager: avoid redundant gh calls for static versions
* actually, removing old tmpdir isn't needed
I tested - downloader code handles this case well already
(also rmdir was wrong func anyway, needed shutil.rmtree if it had content)
* add code comment
This is a port of the ModelSamplerTonemapNoiseTest from the experiments
repo.
To replicate that node use LatentOperationTonemapReinhard and
LatentApplyOperationCFG together.
Somehow managed to drop a file called "nul" into a windows checkpoints subdirectory. This caused all sorts of havoc with many nodes that needed the list of checkpoints.
* add internal /folder_paths route
returns a json maps of folder paths
* (minor) format download_models.py
* initial folder path input on download api
* actually, require folder_path and clean up some code
* partial tests update
* fix & logging
* also download to a tmp file not the live file
to avoid compounding errors from network failure
* update tests again
* test tweaks
* workaround the first tests blocker
* fix file handling in tests
* rewrite test for create_model_path
* minor doc fix
* avoid 'mock_directory'
use temp dir to avoid accidental fs pollution from tests
* Run unit tests on Windows as well.
* Test on mac.
* Continue running on error.
* Compared normalized paths to work cross platform.
* Only test common set of mimetypes across operating systems.
* add 'is_default' to model paths config
including impl and doc in example file
* update weirdly overspecific test expectations
* oh there's two
* sigh
* Override user directory.
* Use overridden user directory.
* Remove prints.
* Remove references to global user_files.
* Remove unused replace_folder function.
* Remove newline.
* Remove global during get_user_directory.
* Add validation.
It probably only works on Linux.
For maximum speed on Flux with Nvidia 40 series/ada and newer try using
this node with fp8_e4m3fn and the --fast argument.
* Update sampling.py
* Update samplers.py
* my bad
* "fix" the sampler
* Update samplers.py
* i named it wrong
* minor sampling improvements
mainly using a dynamic rho value (hey this sounds a lot like smea!!!)
* revert rho change
rho? r? its just 1/2
All commits from the past 30 min are done on the top of Mt Fuji
By Comfy, Robin, and Yoland
All other comfy org members died on the way
Introduced unit tests to verify the correctness of various folder path
utility functions such as `get_directory_by_type`, `annotated_filepath`,
and `recursive_search` among others. These tests cover scenarios
including directory retrieval, filepath annotation, recursive file
searches, and filtering files by extensions, enhancing the robustness
and reliability of the codebase.
* Expand user path.
* Add test.
* Add unit test for expanding base path.
* Simplify unit test.
* Remove comment.
* Remove comment.
* Checkpoints.
* Refactor.
Browsers are dumb and let any website make requests to localhost; this should prevent that without breaking things. CORS prevents the javascript from reading the response, but they can still write it.
At the moment this is only enabled when the --enable-cors-header argument
is not used.
text_encoder_diff should be connected to a CLIPMergeSubtract node.
model_diff and text_encoder_diff are optional inputs so you can create
model only loras, text encoder only loras or a lora that contains both.
This is a format with keys like:
text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.lora_up.weight
Instead of waiting for me to add support for specific lora formats you can
convert your text encoder loras to this format instead.
If you want to see an example save a text encoder lora with the SaveLora
node with the commit right after this one.
* Add route for getting output logs
* Include ComfyUI version
* Move to own function
* Changed to memory logger
* Unify logger setup logic
* Fix get version git fallback
---------
Co-authored-by: pythongosssss <125205205+pythongosssss@users.noreply.github.com>
Currently, if a graph partially fails validation (i.e. some outputs are
valid while others have links from missing nodes), the execution loop
could get an exception resulting in server lockup.
This isn't actually possible to reproduce via the default UI, but is a
potential issue for people using the API to construct invalid graphs.
This code automatically forces upcasting attention for MacOS versions 14.5 and 14.6. My computer returns the string "14.6.1" for `platform.mac_ver()[0]`, so this generalizes the comparison to catch more versions.
I am running MacOS Sonoma 14.6.1 (latest version) and was seeing black image generation on previously functional workflows after recent software updates. This PR solved the issue for me.
See comfyanonymous/ComfyUI#3521
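The generalized check can be sketched as a tuple comparison instead of exact string matches (function name is illustrative):
```
import platform

def macos_needs_attention_upcast() -> bool:
    # "14.6.1" -> (14, 6, 1), so patch releases fall inside the range too
    try:
        ver = tuple(int(p) for p in platform.mac_ver()[0].split("."))
    except ValueError:
        return False
    return (14, 5) <= ver < (14, 7)
```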
This change fixes a bug where non-constant values could be passed to the
IS_CHANGED function. This would result in workflows taking an extra
execution before they acted as if they were cached.
The actual change is like 4 characters -- the rest is adding unit tests.
When generating images with fp8_e4_m3 Flux and batch size >1, using --fast, ComfyUI throws a "view size is not compatible with input tensor's size and stride" error pointing at the first of these two calls to view.
As reshape is semantically equivalent to view except for working on a broader set of inputs, there should be no downside to changing this. The only difference is that it clones the underlying data in cases where .view would error out. I have confirmed that the output still looks as expected, but cannot confirm that no mutable use is made of the tensors anywhere.
Note that --fast is only marginally faster than the default.
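The failure mode and the fix are easy to reproduce in isolation:
```
import torch

x = torch.randn(4, 8, 16).transpose(0, 1)  # non-contiguous strides
# x.view(8, 64)   # RuntimeError: view size is not compatible with input
y = x.reshape(8, 64)  # same semantics; clones only when view() can't work
```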
* Create internal route table.
* List files.
* Add GET /internal/files.
Retrieves list of files in models, output, and user directories.
* Refactor file names.
* Use typing_extensions for Python 3.8
* Fix tests.
* Remove print statements.
* Update README.
* Add output and user to valid directory test.
* Add missing type hints.
Optimizations that might break things/lower quality will be put behind
this flag first and might be enabled by default in the future.
Currently the only optimization is float8_e4m3fn matrix multiplication on
4000/ADA series Nvidia cards or later. If you have one of these cards you
will see a speed boost when using fp8_e4m3fn flux for example.
--reserve-vram 1.0 for example will make ComfyUI try to keep 1GB vram free.
This can also be useful if workflows are failing because of OOM errors but
in that case please report it if --reserve-vram improves your situation.
* Add Flux model support for InstantX style controlnet residuals
* Refactor Flux controlnet residual step to a separate method
* Rollback minor change
* New format for applying controlnet residuals: input->double_blocks, output->single_blocks
* Adjust XLabs Flux controlnet to fit new syntax of applying Flux controlnet residuals
* Remove unnecessary import and minor style change
* Execution Model Inversion
This PR inverts the execution model -- from recursively calling nodes to
using a topological sort of the nodes. This change allows for
modification of the node graph during execution. This allows for two
major advantages:
1. The implementation of lazy evaluation in nodes. For example, if a
"Mix Images" node has a mix factor of exactly 0.0, the second image
input doesn't even need to be evaluated (and vice-versa if the mix
factor is 1.0).
2. Dynamic expansion of nodes. This allows for the creation of dynamic
"node groups". Specifically, custom nodes can return subgraphs that
replace the original node in the graph. This is an incredibly
powerful concept. Using this functionality, it was easy to
implement:
a. Components (a.k.a. node groups)
b. Flow control (i.e. while loops) via tail recursion
c. All-in-one nodes that replicate the WebUI functionality
d. and more
All of those were able to be implemented entirely via custom nodes,
so those features are *not* a part of this PR. (There are some
front-end changes that should occur before that functionality is
made widely available, particularly around variant sockets.)
The custom nodes associated with this PR can be found at:
https://github.com/BadCafeCode/execution-inversion-demo-comfyui
Note that some of them require that variant socket types ("*") be
enabled.
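The core of the new execution loop is an ordinary topological sort. A self-contained sketch under that assumption (Kahn's algorithm, with the cycle check that surfaces as a UI error):
```
from collections import deque

def topological_order(inputs):
    """inputs: node_id -> list of node_ids this node depends on."""
    indegree = {n: len(deps) for n, deps in inputs.items()}
    consumers = {n: [] for n in inputs}
    for node, deps in inputs.items():
        for parent in deps:
            consumers[parent].append(node)
    ready = deque(n for n, d in indegree.items() if d == 0)
    order = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for child in consumers[node]:
            indegree[child] -= 1
            if indegree[child] == 0:
                ready.append(child)
    if len(order) != len(inputs):
        raise RuntimeError("dependency cycle detected")
    return order
```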
* Allow `input_info` to be of type `None`
* Handle errors (like OOM) more gracefully
* Add a command-line argument to enable variants
This allows the use of nodes that have sockets of type '*' without
applying a patch to the code.
* Fix an overly aggressive assertion.
This could happen when attempting to evaluate `IS_CHANGED` for a node
during the creation of the cache (in order to create the cache key).
* Fix Pyright warnings
* Add execution model unit tests
* Fix issue with unused literals
Behavior should now match the master branch with regard to undeclared
inputs. Undeclared inputs that are socket connections will be used while
undeclared inputs that are literals will be ignored.
* Make custom VALIDATE_INPUTS skip normal validation
Additionally, if `VALIDATE_INPUTS` takes an argument named `input_types`,
that variable will be a dictionary of the socket type of all incoming
connections. If that argument exists, normal socket type validation will
not occur. This removes the last hurdle for enabling variant types
entirely from custom nodes, so I've removed that command-line option.
I've added appropriate unit tests for these changes.
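A sketch of a custom node using that mechanism (the class and messages are illustrative):
```
class FlexibleNode:
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {"value": ("*",)}}  # variant socket

    @classmethod
    def VALIDATE_INPUTS(cls, value, input_types):
        # Declaring `input_types` skips core socket-type validation; the node
        # receives {"value": "<TYPE>"} for each incoming connection instead.
        if input_types["value"] not in ("IMAGE", "LATENT"):
            return "value must be an IMAGE or LATENT"
        return True
```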
* Fix example in unit test
This wouldn't have caused any issues in the unit test, but it would have
bugged the UI if someone copy+pasted it into their own node pack.
* Use fstrings instead of '%' formatting syntax
* Use custom exception types.
* Display an error for dependency cycles
Previously, dependency cycles that were created during node expansion
would cause the application to quit (due to an uncaught exception). Now,
we'll throw a proper error to the UI. We also make an attempt to 'blame'
the most relevant node in the UI.
* Add docs on when ExecutionBlocker should be used
* Remove unused functionality
* Rename ExecutionResult.SLEEPING to PENDING
* Remove superfluous function parameter
* Pass None for uneval inputs instead of default
This applies to `VALIDATE_INPUTS`, `check_lazy_status`, and lazy values
in evaluation functions.
* Add a test for mixed node expansion
This test ensures that a node that returns a combination of expanded
subgraphs and literal values functions correctly.
* Raise exception for bad get_node calls.
* Minor refactor of IsChangedCache.get
* Refactor `map_node_over_list` function
* Fix ui output for duplicated nodes
* Add documentation on `check_lazy_status`
* Add file for execution model unit tests
* Clean up Javascript code as per review
* Improve documentation
Converted some comments to docstrings as per review
* Add a new unit test for mixed lazy results
This test validates that when an output list is fed to a lazy node, the
node will properly evaluate previous nodes that are needed by any inputs
to the lazy node.
No code in the execution model has been changed. The test already
passes.
* Allow kwargs in VALIDATE_INPUTS functions
When kwargs are used, validation is skipped for all inputs as if they
had been mentioned explicitly.
* List cached nodes in `execution_cached` message
This was previously just bugged in this PR.
* Add support for simple tooltips
* Fix overflow
* Add tooltips for nodes in the default workflow
* new line
* Prevent potential crash
* PR feedback
* Hide tooltip when clicking (e.g. combo widget)
* Refactor tooltips, add node level support
* Fix
* move
* Fix test (and undo last change)
* Fixed indent
* Fix dom widgets, dont show tooltip if not over canvas
* Add model downloading endpoint.
* Move client session init to async function.
* Break up large function.
* Send "download_progress" as websocket event.
* Fixed
* Fixed.
* Use async mock.
* Move server set up to right before run call.
* Validate that model subdirectory cannot contain relative paths.
* Add download_model test checking for invalid paths.
* Remove DS_Store.
* Consolidate DownloadStatus and DownloadModelResult
* Add progress_interval as an optional parameter.
* Use tuple type from annotations.
* Use pydantic.
* Update comment.
* Revert "Use pydantic."
This reverts commit 7461e8eb00.
* Add new line.
* Add newline EOF.
* Validate model filename as well.
* Add comment to not rely on internal.
* Restrict downloading to safetensor files only.
* add support for HunYuanDit ControlNet
* fix hunyuandit controlnet
* fix typo in hunyuandit controlnet
* fix typo in hunyuandit controlnet
* fix code format style
* add control_weight support for HunyuanDit Controlnet
* use control_weights in HunyuanDit Controlnet
* fix typo
'_target' allows secrets to pass through, and we're just using the secret that allows uploading to the dashboard and are manually vetting PRs before running this workflow anyway
The keys are just: model.full.model.key.name.lora_up.weight
It is supported by all models that ComfyUI supports.
Now people can just convert loras to this format instead of having to ask
for me to implement them.
* Lower SAG step for finer control
Since the introduction of cfg++, which uses very low cfg values, a step of 0.1 in SAG might be too high for finer control. Even a SAG of 0.1 can be too high when cfg is only 0.6, so I changed the step to 0.01.
* Lower PAG step as well.
* Update nodes_sag.py
This breaks seeds for resolutions that are not a multiple of 16 in pixel
resolution by using circular padding instead of reflection padding but
should lower the amount of artifacts when doing img2img at those
resolutions.
Fixes an issue where under certain conditions, the ComfyUI custom undo / redo functions would not run when intended to.
When trying to undo an action like deleting several nodes, instead the native browser undo runs - e.g. a textarea gets focus and the last typed text is undone. Clicking outside the text area and typing again just keeps doing the same thing.
* Let tokenizers return weights to be stored in the saved checkpoint.
* Basic hunyuan dit implementation.
* Fix some resolutions not working.
* Support hydit checkpoint save.
* Init with right dtype.
* Switch to optimized attention in pooler.
* Fix black images on hunyuan dit.
* cli_args: Add --duplicate-check-hash-function.
* server.py: compare_image_hash configurable hash function
Uses an argument added in cli_args to specify the type of hashing to default to for duplicate hash checking. Uses an `eval()` to identify the specific hashlib class to utilize, but ultimately safely operates because we have specific options and only those options/choices in the arg parser. So we don't have any unsafe input there.
* Add hasher() to node_helpers
* hashlib selection moved to node_helpers
* default-hashing-function instead of dupe checking hasher
This makes a default-hashing-function option instead of previous selected option.
* Use args.default_hashing_function
* Use safer handling for node_helpers.hasher()
Uses a safer handling method than `eval` to evaluate default hashing function.
* Stray parentheses are evil.
* Indentation fix.
Somehow when I hit save I didn't notice I missed a space to make indentation work properly. Oops!
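The safer lookup boils down to resolving the constructor by attribute instead of `eval()` (a sketch; the surrounding arg wiring is omitted):
```
import hashlib

def hasher(name: str = "sha256"):
    # `name` comes from a fixed argparse choices list, and getattr can only
    # reach hashlib's own attributes -- no arbitrary code evaluation.
    return getattr(hashlib, name)

digest = hasher("sha256")(b"image bytes").hexdigest()
```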
* Add frontend manager
* Add tests
* nit
* Add unit test to github CI
* Fix path
* nit
* ignore
* Add logging
* Install test deps
* Remove 'stable' keyword support
* Update test
* Add web-root arg
* Rename web-root to front-end-root
* Add test on non-exist version number
* Use repo owner/name to replace hard coded provider list
* Inline cmd args
* nit
* Fix unit test
* Fix send to workflow
Fix center align of close workflow dialog
Better support for elements around canvas
* More resilient to extra elements added to body
* Add stable release.
* Only build CUDA 12.1 + 3.11 Python.
* Upgrade checkout and setup-python to latest version.
* lzma2
* Update artifact name to be ComfyUI_windows_portable_nvidia.7z
This is only the model code itself; it currently defaults to an empty embedding [0] * 6, which seems to work better than treating it like a regular controlnet.
TODO: Add nodes to select the image type.
* Fix to #3465. Prevent the resaving of duplicate images if overwrite is not specified
This is a fix to #3465
Adds function compare_image_hash to do a sha256 hash comparison between an uploaded image and existing images with matching file names.
This changes the behavior so that only images having the same filename that are actually different are saved to input; existing images are now opened instead of being resaved with an increment.
Currently, exact duplicates with the same filename are resaved with an incremented filename in the format:
<filename> (n).ext
with the code:
```
while os.path.exists(filepath):
    filename = f"{split[0]} ({i}){split[1]}"
    filepath = os.path.join(full_output_folder, filename)
    i += 1
```
This commit changes this to:
```
while os.path.exists(filepath):
    if compare_image_hash(filepath, image):
        image_is_duplicate = True
        break
    filename = f"{split[0]} ({i}){split[1]}"
    filepath = os.path.join(full_output_folder, filename)
    i += 1
```
A check for `image_is_duplicate == False` is done before saving the file.
Currently, if you load the same image of a cat named cat.jpg into the LoadImage node 3 times, you will get 3 new files in your input folder with incremented file names.
With this change, you will now only have the single copy of cat.jpg, that will be re-opened instead of re-saved.
However if you load 3 different images of cats named cat.jpg, you will get the expected behavior of having:
cat.jpg
cat (1).jpg
cat (2).jpg
This saves space and clutter. After checking my own input folder, I have 800+ images that are duplicates that were resaved with incremented file names amounting to more than 5GB of duplicated data.
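A sketch of what such a comparison can look like, assuming `image` is a multipart upload field with a seekable `.file` (an assumption about the upload handler):
```
import hashlib

def compare_image_hash(filepath: str, image) -> bool:
    with open(filepath, "rb") as f:
        on_disk = hashlib.sha256(f.read()).hexdigest()
    image.file.seek(0)
    uploaded = hashlib.sha256(image.file.read()).hexdigest()
    image.file.seek(0)  # rewind so the save path can still read the upload
    return on_disk == uploaded
```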
* fixed typo in expression
This change includes corrections for several spelling errors in the
documentation of example_node.py.example file.
These were previously raised by #3157, but they missed a few.
* menu
* wip
* wip
* wip
* wip
* wip
* workflow saving/loading
* Support inserting workflows
Move buttons to top of lists
* fix session storage
implement renaming
* temp
* refactor, better workflow instance management
* wip
* progress on progress
* added send to workflow
various fixes
* Support multiple image loaders
* Support dynamic size breakpoints based on content
* various fixes
add close unsaved warning
* Add filtering tree
* prevent renaming unsaved
* fix zindex on hover
* fix top offset
* use filename as workflow name
* resize on setting change
* hide element until it is drawn
* remove glow
* Fix export name
* Fix test, revert accidental changes to groupNode
* Fix colors on all themes
* show hover items on smaller screen (mobile)
* remove debugging code
* dialog fix
* Dont reorder open workflows
Allow elements around canvas
* Toggle body display on setting change
* Fix menu disappearing on chrome
* Increase delay when typing, remove margin on Safari, fix dialog location
* Fix overflow issue on iOS
* Add reset view button
Prevent view changes causing history entries
* Bottom menu wip
* Various fixes
* Fix merge
* Fix breaking old menu position
* Fix merge adding restore view to loadGraphData
There were two bugs in PorterDuffImageComposite.
The first one is the fact that it uses the mask input directly as alpha, missing the conversion (`1-a`). The fix is similar to c16f5744.
The second one is that all color composition formulas assume alpha premultiplied values, while the input is not premultiplied.
This change fixes both of these issues.
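Both fixes together, sketched for a source-over composite (a minimal sketch, assuming RGB tensors shaped [..., 3] and alpha/mask tensors shaped [...]):
```
import torch

def source_over(src_rgb, dst_rgb, dst_a, mask):
    src_a = 1.0 - mask                            # bug 1: mask must be inverted
    src_p = src_rgb * src_a.unsqueeze(-1)         # bug 2: premultiply colors
    dst_p = dst_rgb * dst_a.unsqueeze(-1)
    out_a = src_a + dst_a * (1.0 - src_a)
    out_p = src_p + dst_p * (1.0 - src_a).unsqueeze(-1)
    out_rgb = out_p / out_a.clamp(min=1e-8).unsqueeze(-1)  # un-premultiply
    return out_rgb, out_a
```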
I will not add this dependency to the default ones because models in the
spandrel_extra_arches package are non-commercial and therefore not
compatible with free software licenses like the one ComfyUI uses.
If you don't mind this you can install it manually yourself.
Extends the core Comfy.SnapToGrid behavior for nodes to apply to LiteGraph's LGraphGroup with the same behavior. Also, pulls out redundant rounding code into util function.
On some keyboards it's apparently too easy to accidentally do CTRL-Delete
when pressing CTRL-Enter repeatedly.
CTRL-Backspace can still be used to clear the workflow.
* Fix to LoadImage Node for #3416 HDR images loading additional smaller images.
Added a blocking if statement in the ImageSequence.Iterator that checks if subsequent images after the first match dimensionally, and prevents them from being appended to output_images if they do not match.
This does not fix or change current behavior for PIL 10.2.0 where the images are loaded at the same size, but it does for 10.3.0 where they are loaded at their correct smaller sizes.
* added list of excluded formats that should return 1 image
added an explicit check for the image format so that additional formats can be added to the list that have problematic behavior.
* Change calculation of memory total to be more accurate; allocated is actually smaller than reserved.
* Update README.md install documentation for Intel GPUs.
* Update node_helpers.py to use generic pillow wrapper to resolve multiple meta-data related issues.
Replaced the open_image function with a generic pillow function that takes PIL functions as a dependency injection and applies the ImageFile.LOAD_TRUNCATED_IMAGES try/except fix to them.
This provides an extensible function to handle related errors that can wrap offending functions when discovered without the need to repeat code.
* Update a few PIL functions to use node_helpers.pillow wrapper
Update PIL function calls in a few locations to use the generic node_helpers.pillow wrapper that takes the function as a dependency injection and uses the try/except method with the ImageFile.LOAD_TRUNCATED_IMAGES solution
* Corrected comment in issue #s fixed.
* Update node_helpers.py to remove import of Image from PIL
import of Image is no longer required as functions are injected
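The wrapper pattern, roughly (a sketch; the exact exception list is an assumption):
```
from PIL import ImageFile

def pillow(fn, arg):
    prev = None
    try:
        out = fn(arg)
    except (OSError, ValueError):
        # e.g. "Decompressed Data Too Large" from oversized ICC profiles
        prev = ImageFile.LOAD_TRUNCATED_IMAGES
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        out = fn(arg)
    finally:
        if prev is not None:
            ImageFile.LOAD_TRUNCATED_IMAGES = prev
    return out

# usage: img = pillow(Image.open, path)
```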
I was going to completely remove this function because it is unmaintainable
but I think this is the best compromise.
The clip skip and v_prediction parts of the configs should still work but
not the fp16 vs fp32.
* Fix issue with how PIL loads small PNG files in nodes.py
Added flag to prevent ValueError: Decompressed Data Too Large
when loading PNG images with large metadata such as large embedded color profiles
* Update LoadImage node to fix error when loading PNGs in nodes.py
Fixed ValueError: Decompressed Data Too Large thrown by PIL when attempting to open PNG files with large embedded ICC colorspaces by setting the following flag to true when loading PNG images: ImageFile.LOAD_TRUNCATED_IMAGES = True
* Update node_helpers.py to include open_image helper function
open_image includes try except to catch Pillow Value Errors that occur when large ICC profiles are embedded in images.
* Update LoadImage node to use open_image helper function inplace of Image.open
open_image helper function in node_helpers.py fixes a Pillow error when attempting to open images with large embedded ICC profiles by adding an exception handler to load the image with truncated meta data if regular loading is not possible.
* Add TLS Support
* Add to readme
* Add guidance for windows users on generating certificates
* Fix typo
This sampler is an LCM sampler that upscales the latent during sampling.
It can be used to generate at a higher resolution with an LCM model very
quickly.
To try it use it with a basic 5 step LCM workflow with scale_ratio 1.5 or
2.0
* Make input/widget conversion sub-menus optional
* Improve input/widget conversion sub-menu text
- Fix incorrect text for conversion from widget to input, previously it
effectively said "convert input to input"
- Use "input" instead of "🔘". The former is clearer and consistent
with the rest of the application.
- Use title case (consistent with the rest of the menu entries).
- Strip the trailing periods. There is already a visual indicator for
sub-menus, and no other sub-menus use trailing periods.
* A1111 import: Set VAE name
This patch sets the VAE name for the `VAELoader` when present in the png
metadata.
* A1111 import: Skip all hashes
When importing from A1111 the parsing assumes that values of a key will
never contain a ":", which is not correct.
There are 2 cases where we can have ":" in the value:
- Inside a string. E.g.:
Lora hashes: "xl_more_art-full_v1: fe3b4816be83, add-detail-xl: 9c783c8ce46c"
- When the value is a json dictionary. E.g.:
Hashes: {"vae": "63aeecb90f", "embed:negativeXL_D": "fff5d51ab6"}
This patch changes how we parse the metadata to take those 2 cases into
account, and also skips the following additional keys that are present in
some Forge images (a parsing sketch follows the list):
- Version
- VAE hash
- TI hashes
- Lora hashes
- Hashes
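A rough sketch of parsing that respects both cases above (illustrative only; the helper name and regex are assumptions, not the actual patch):

```python
import re

# Keys skipped per the list above.
SKIP_KEYS = {"Version", "VAE hash", "TI hashes", "Lora hashes", "Hashes"}

def parse_a1111_params(line: str) -> dict:
    params = {}
    # A value may be a quoted string (which can contain ":" and ","),
    # a JSON dictionary, or a plain token terminated by a comma.
    pattern = r'\s*([\w ]+):\s*("(?:[^"\\]|\\.)*"|\{.*?\}|[^,]*)(?:,|$)'
    for key, value in re.findall(pattern, line):
        key = key.strip()
        if key in SKIP_KEYS:
            continue
        params[key] = value.strip().strip('"')
    return params
```

The point is that quoted strings and JSON dictionaries are matched as whole values before falling back to comma-terminated tokens, so the ":" characters inside them no longer break the key/value split.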
* A1111 import: Parse Hires steps
This patch parses the `Hires steps` parameter that is part of the High
Resolution Upscale configuration when it is present, and falls back to
the one from the `samplerNode` (like the code currently does) if it's
not present.
This is the code to load the model and run inference with only a text
prompt. This commit does not contain the nodes to properly use it with an
image input.
This supports both the original SD1 instructpix2pix model and the
diffusers SDXL one.
* Implement Differential Diffusion
* Cleanup.
* Fix.
* Masks should be applied at full strength.
* Fix colors.
* Register the node.
* Cleaner code.
* Fix issue with getting unipc sampler.
* Adjust thresholds.
* Switch to linear thresholds.
* Only calculate nearest_idx on valid thresholds.
* let's toggle this setting first.
* also makes it easier to debug. I'll be honest, this is generally preferred behavior for me as well, but I ain't no power user, shrug.
* attempting a trick to put the work for filter: brightness on the GPU as a first attempt, before falling back to not using the filter for large lists!
* revert litegraph.core.js changes from branch
* oops
* Fix render on change not respecting auto queue checkbox
Fix issue where autoQueueEnabled checkbox is ignored for changes if autoQueueMode is left on `change`
* Make check more specific
* Add toggle to enable auto queue when graph is changed
* type fix
* better
* better alignment
* Change undoredo to not ignore inputs when autoqueue in change mode
- Implemented a `try-except` block in the `recursive_search` function to handle `FileNotFoundError` gracefully.
- When encountering a file or directory path that cannot be accessed (causing `FileNotFoundError`), the code now logs a warning and skips processing for that specific path instead of crashing the node (CheckpointLoaderSimple was usually the first to break). This allows the rest of the directory traversal to proceed without interruption.
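A minimal sketch of the described pattern (function name matches the commit; the traversal details are assumptions):

```python
import logging
import os

def recursive_search(directory: str) -> list[str]:
    result = []
    for dirpath, _subdirs, filenames in os.walk(directory, followlinks=True):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                os.path.getmtime(path)  # may raise on broken symlinks, etc.
            except FileNotFoundError:
                # Log and skip the offending path instead of crashing the node.
                logging.warning("Warning: Unable to access %s. Skipping this path.", path)
                continue
            result.append(path)
    return result
```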
* wip per user data
* Rename, hide menu
* better error
rework default user
* store pretty
* Add userdata endpoints
Change nodetemplates to userdata
* add multi user message
* make normal arg
* Fix tests
* Ignore user dir
* user tests
* Changed to default to browser storage and add server-storage arg
* fix crash on empty templates
* fix settings added before load
* ignore parse errors
This node lets you generate a batch of images with different elevations or
azimuths by setting the elevation_batch_increment and/or
azimuth_batch_increment.
It also sets the batch index for the latents so that the same init noise is
used on each frame.
A POST request to /free with: {"unload_models":true}
will unload models from vram.
A POST request to /free with: {"free_memory":true}
will unload models and free all cached data from the last run workflow.
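For example, from Python (assuming a local server on the default port 8188):

```python
import json
from urllib.request import Request, urlopen

# POST {"unload_models": true} to /free; {"free_memory": true} works the same way.
req = Request(
    "http://127.0.0.1:8188/free",
    data=json.dumps({"unload_models": True}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
urlopen(req)
```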
* First SAG test
* need to put extra options on the model instead of patcher
* no errors and results seem not-broken
* Use @ashen-uncensored formula, which works better!!!
* Fix a crash when using weird resolutions. Remove an unnecessary UNet call
* Improve comments, optimize memory in blur routine
* SAG works with sampler_cfg_function
* Prevent cleaning graph state on undo/redo
* Remove pause rendering due to LG bug
* Fix crash on disconnected internal reroutes
* Fix widget inputs being incorrect order and value
* Fix initial primitive values on connect
* basic support for basic rerouted converted inputs
* Populate primitive to reroute input
* dont crash on bad primitive links
* Fix convert to group changing control value
* reduce restrictions
* fix random crash in tests
Use a simple CLIP model implementation instead of the one from
transformers.
This will allow some interesting things that would be too hackish to implement
using the transformers implementation.
Previously when a list of 3 images [0, 1, 2] was used for a 6 frame video
they were concatenated like this:
[0, 1, 2, 0, 1, 2]
now they are concatenated like this:
[0, 0, 1, 1, 2, 2]
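In PyTorch terms, the change is essentially the following (a toy illustration, not the actual code):

```python
import torch

# Three dummy "images" (batch of 3, 1x1 pixels) to show the ordering.
images = torch.arange(3).reshape(3, 1, 1, 1).float()
# Old behavior: tile the whole batch -> frame order [0, 1, 2, 0, 1, 2]
old = images.repeat(2, 1, 1, 1)
# New behavior: repeat each image consecutively -> [0, 0, 1, 1, 2, 2]
new = images.repeat_interleave(2, dim=0)
```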
* setup ui unit tests
* Refactoring, adding connections
* Few tweaks
* Fix type
* Add general test
* Refactored and extended test
* move to describe
* for groups
* wip group nodes
* Relink nodes
Fixed widget values
Convert to nodes
* Reconnect on convert back
* add via node menu + canvas
refactor
* Add ws event handling
* fix using wrong node on widget serialize
* allow reroute pipe
fix control_after_generate configure
* allow multiple images
* Add test for converted widgets on missing nodes + fix crash
* tidy
* more tests + refactor
* throw earlier to get less confusing error
* support outputs
* more test
* add ci action
* use lts node
* Fix?
* Prevent connecting non matching combos
* update
* accidentally removed npm i
* Disable logging extension
* fix naming
allow control_after_generate custom name
allow convert from reroutes
* group node tests
* Add executing info, custom node icon
Tidy
* internal reroute just works
* Fix crash on virtual nodes e.g. note
* Save group nodes to templates
* Fix template nodes not being stored
* Fix aborting convert
* tidy
* Fix reconnecting output links on convert to group
* Fix links on convert to nodes
* Handle missing internal nodes
* Trigger callback on text change
* Apply value on connect
* Fix converted widgets not reconnecting
* Group node updates
- persist internal ids in current session
- copy widget values when converting to nodes
- fix issue serializing converted inputs
* Resolve issue with sanitized node name
* Fix internal id
* allow outputs to be used internally and externally
* order widgets on group node
various fixes
* fix imageupload widget requiring a specific name
* groupnode imageupload test
give widget unique name
* Fix issue with external node links
* Add VAE model
* Fix internal node id check
* fix potential crash
* wip widget input support
* more wip group widget inputs
* Group node refactor
Support for primitives/converted widgets
* Fix convert to nodes with internal reroutes
* fix applying primitive
* Fix control widget values
* fix test
The img2vid model is conditioned on the clip vision output only, which means
there's no CLIP text model, which is why I added an ImageOnlyCheckpointLoader to
load it. Note that the unCLIPCheckpointLoader can also load it because it
also has a CLIP_VISION output.
SDV_img2vid_Conditioning is the node used to pass the right conditioning
to the img2vid model.
VideoLinearCFGGuidance applies a linearly decreasing CFG scale to each
video frame from the cfg set in the sampler node to min_cfg.
SDV_img2vid_Conditioning can be found in conditioning->video_models
ImageOnlyCheckpointLoader can be found in loaders->video_models
VideoLinearCFGGuidance can be found in sampling->video_models
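As a rough illustration of the per-frame CFG schedule described above (a sketch, not the node's exact implementation):

```python
import torch

def per_frame_cfg(cfg: float, min_cfg: float, num_frames: int) -> torch.Tensor:
    # Linearly interpolate from the sampler's cfg down to min_cfg across
    # the video frames, per the description above.
    return torch.linspace(cfg, min_cfg, num_frames)
```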
This doesn't affect how percentages behave in the frontend but breaks
things if you relied on them in the backend.
percent_to_sigma goes from 0 to 1.0 instead of 1.0 to 0 for less confusion.
Make percent 0 return an extremely large sigma and percent 1.0 return a
sigma of zero to fix imprecision.
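A toy illustration of the endpoint behavior described above (the interpolation in between is made up for the sketch; only the endpoints match the description):

```python
def percent_to_sigma(percent: float, sigma_max: float = 14.6, sigma_min: float = 0.03) -> float:
    if percent <= 0.0:
        return 999999999.9  # effectively infinite sigma at percent 0
    if percent >= 1.0:
        return 0.0  # exactly zero sigma at percent 1.0, avoiding imprecision
    # Made-up linear interpolation, for illustration only.
    return sigma_max + (sigma_min - sigma_max) * percent
```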
More generic clip model class that can be used on more types of text
encoders.
Don't apply weighting algorithm when weight is 1.0
Don't compute an empty token output when it's not needed.
* add drag-drop to node template manager
* better dnd, save field on change
* actually save templates
---------
Co-authored-by: matt3o <matt3o@gmail.com>
This should make things that use sampler_cfg_function behave like before.
Added an input argument for those that want the denoised output.
This means you can calculate the x0 prediction of the model by doing:
(input - cond) for example.
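A hedged sketch of a custom sampler_cfg_function under these conventions (the argument names are assumptions inferred from the description above):

```python
def my_cfg_function(args):
    # Per the note above, the x0 prediction can be computed from the
    # inputs handed to sampler_cfg_function.
    x0 = args["input"] - args["cond"]  # x0 prediction, as described above
    # ...inspect or modify things here, then return the usual CFG mix:
    return args["uncond"] + (args["cond"] - args["uncond"]) * args["cond_scale"]
```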
DDIM is the same as euler with a small difference in the inpaint code.
DDIM uses randn_like but I set a fixed seed instead.
I'm keeping it in because I'm sure if I remove it people are going to
complain.
apply_model in model_base now returns the denoised output.
This means that sampling_function now computes things on the denoised
output instead of the model output. This should make things more consistent
across current and future models.
* setup ui unit tests
* Refactoring, adding connections
* Few tweaks
* Fix type
* Add general test
* Refactored and extended test
* move to describe
* for groups
* Add test for converted widgets on missing nodes + fix crash
* tidy
* more tests + refactor
* throw earlier to get less confusing error
* support outputs
* more test
* add ci action
* use lts node
* Fix?
* Prevent connecting non matching combos
* update
* accidentally removed npm i
* Disable logging extension
* added step to generate object_info
* fix python
* install python
* install deps
* fix cwd?
* logging
* Fix double resolve
* create dir
* update pkg
* improve image preview
- grid mode: align in rectangle instead of first image, show cell border
- individual mode: proper ratio handling
* improve: fix preview button position instead of relative
* improve: image preview - compact mode for same aspect ratio
* Allow mask batches
This allows the LatentCompositeMasked node to work with AnimateDiff. I tried to keep the old functionality too, unsure if it's correct, but both a single mask and a batch of masks seem to work with this change.
* Update nodes_mask.py
This node takes a list of sigmas and a sampler object as input.
This lets people easily implement custom schedulers and samplers as nodes.
More nodes will be added to it in the future.
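For illustration, a custom scheduler node might look like this (a sketch following ComfyUI's node conventions; the class name and default values are made up):

```python
import torch

class SimpleLinearScheduler:
    # Hypothetical custom scheduler node that outputs SIGMAS for a sampler.
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {
            "steps": ("INT", {"default": 10, "min": 1, "max": 1000}),
            "sigma_max": ("FLOAT", {"default": 14.6}),
            "sigma_min": ("FLOAT", {"default": 0.03}),
        }}

    RETURN_TYPES = ("SIGMAS",)
    FUNCTION = "get_sigmas"
    CATEGORY = "sampling/custom_sampling"

    def get_sigmas(self, steps, sigma_max, sigma_min):
        sigmas = torch.linspace(sigma_max, sigma_min, steps)
        # Append the final zero sigma expected at the end of sampling.
        return (torch.cat([sigmas, torch.zeros(1)]),)
```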
* Add inference tests
* Clean up
* Rename test graph file
* Add readme for tests
* Separate server fixture
* test file name change
* Assert images are generated
* Clean up comments
* Add __init__.py so tests can run with command line `pytest`
* Fix command line args for pytest
* Loop all samplers/schedulers in test_inference.py
* Ci quality workflows compare (#1)
* Add image comparison tests
* Comparison tests do not pass with empty metadata
* Ensure tests are run in correct order
* Save image files with test name
* Update tests readme
* Reduce step counts in tests to ~halve runtime
* Ci quality workflows build (#2)
* Add build test github workflow
Before, I had made the last layer the penultimate layer because some
checkpoints don't have it, but that's not consistent with the other models.
TLDR: for SD2.x models only: CLIPSetLastLayer -1 is now -2.
It can be useful to paste images from the clipboard directly into the node graph.
This commit modifies copy and paste handling to support this.
When an image file is found in the clipboard, we check whether an image node is selected.
If so, paste the image into that node. Otherwise, a new node is created.
If no image data are found in the clipboard, we call the original Litegraph paste.
To ensure that onCopy and onPaste events are fired, we override Litegraph's ctrl+c and ctrl+v handling.
Try to detect whether the pasted image is a real file on disk, or just pixel data copied from e.g. Photoshop.
Pasted pixel data will be called 'image.png' and have a creation time of now.
If it is simply pasted data, we store it in the subfolder /input/clipboard/.
This also adds support for the subfolder property in the IMAGEUPLOAD widget.
* Added label for autoQueueCheckbox.
* Menu gets behind some custom nodes.
* Edited extraOptions.
Options divided into different divs to manage them with ease.
Control loras are controlnets where some of the weights are stored in
"lora" format: an up and a down low rank matrix that, when multiplied
together and added to the unet weight, give the controlnet weight.
This allows a much smaller memory footprint depending on the rank of the
matrices.
These controlnets are used just like regular ones.
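In tensor terms (shapes invented for the illustration):

```python
import torch

# Reconstruct a controlnet weight from a base unet weight plus a low-rank
# up/down pair, as described above. Rank 16 is an arbitrary example.
unet_weight = torch.randn(320, 320)
up = torch.randn(320, 16)
down = torch.randn(16, 320)
controlnet_weight = unet_weight + up @ down
```

Only the small `up` and `down` factors need to be stored, which is where the memory savings come from.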
Thank you for adding this feature (linksRenderMode) to core. I would like to add the "Hidden" option (invalid number 3 will just hide the connector lines), so that I can remove that extension from my extension pack to prevent conflicts
https://github.com/failfa-st/failfast-comfyui-extensions
Similar to fixes in litegraph.js editor demo:
3ef215cf11/editor/js/code.js (L19-L28)
Also includes workarounds to address the viewport problem of litegraph.js in DPI scaling scenarios.
Fixes #161
* support startup script for installation without locking on windows
* modified: instead of executing scripts from the startup-scripts directory, changed it to execute the prestartup_script.py for each custom node.
This enables local reverse-proxies to host ComfyUI on a path, e.g. "http://example.com/ComfyUI/", in such a way that at least everything I tested works. Without this patch, proxying ComfyUI in this way will yield errors.
The wrong dimensions were being checked: [1] and [2] are the image size,
not [2] and [3]. This results in an out-of-bounds error if one of them
actually matches.
If the same node id with the same class exists between two executions the
same instance will be used.
This means you can now cache things in nodes for more efficiency.
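A hypothetical sketch of what this enables (`expensive_load` is a made-up helper, not part of ComfyUI):

```python
class CachedLoader:
    # Because the same node instance persists across executions (same node
    # id + same class), expensive results can be memoized on self.
    def __init__(self):
        self.cached_path = None
        self.cached_model = None

    def load(self, path):
        if self.cached_path != path:
            self.cached_model = expensive_load(path)  # hypothetical helper
            self.cached_path = path
        return self.cached_model
```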
The created checkpoints contain workflow metadata that can be loaded by
dragging them on top of the UI or loading them with the "Load" button.
Checkpoints will be saved in fp16 or fp32 depending on the format ComfyUI
is using for inference on your hardware. To force fp32 use: --force-fp32
Anything that patches the model weights like merging or loras will be
saved.
The output directory is currently set to: output/checkpoints but that might
change in the future.
* support preview mode for mask editor.
* use original file reference instead of loaded frontend blob
bugfix:
* prevent the file open dialog when saving to the load image node
* bugfix: cannot clear previous mask painted image's alpha
* bugfix
* bugfix
---------
Co-authored-by: Lt.Dr.Data <lt.dr.data@gmail.com>
* To reduce bandwidth traffic in a remote environment, a lossy compression-based preview mode is provided for displaying simple visualizations in node-based widgets.
* Added 'preview=[image format]' option to the '/view' API.
* Updated node to use preview for displaying images as widgets.
* Excluded preview usage in the open image, save image, and mask editor, where the original data is required.
* Made preview_format parameterizable for extensibility.
* default preview format changed: jpeg -> webp
* Support advanced preview_format option.
- grayscale option for visual debugging
- quality option for aggressive reducing
The option format is: [L;]format[;quality]
e.g.
jpeg => rgb, jpeg, quality 90
L;webp;80 => grayscale, webp, quality 80
L;png => grayscale, png, quality 90
webp;50 => rgb, webp, quality 50
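A sketch of parsing this option (illustrative only, not the actual implementation):

```python
def parse_preview_format(spec: str, default_quality: int = 90):
    # Parse "[L;]format[;quality]": an optional grayscale "L" prefix,
    # an image format, and an optional quality.
    parts = spec.split(";")
    grayscale = parts[0] == "L"
    if grayscale:
        parts = parts[1:]
    image_format = parts[0]
    quality = int(parts[1]) if len(parts) > 1 else default_quality
    return grayscale, image_format, quality

assert parse_preview_format("jpeg") == (False, "jpeg", 90)
assert parse_preview_format("L;webp;80") == (True, "webp", 80)
```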
* move comment
* add settings for preview_format
* default value is '' (= don't re-encode)
---------
Co-authored-by: Lt.Dr.Data <lt.dr.data@gmail.com>
* wip
* latent dir
* fix
* fix
* now working
* mark todo
* remove server.py changes to a separate PR
---------
Co-authored-by: Lt.Dr.Data <lt.dr.data@gmail.com>
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
You can download the stable diffusion 1.5 one from: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt
You can download the stable diffusion 1.5 one from: https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/blob/main/v1-5-pruned-emaonly-fp16.safetensors
description:"Something is broken inside of ComfyUI. (Do not use this if you're just having issues and need help, or if the issue relates to a custom node)"
labels:["Potential Bug"]
body:
- type:markdown
attributes:
value:|
Before submitting a **Bug Report**, please ensure the following:
- **1:** You are running the latest version of ComfyUI.
- **2:** You have looked at the existing bug reports and made sure this isn't already reported.
- **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing
`--disable-all-custom-nodes` command line argument.
- **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact
steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen.
If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
- type:textarea
attributes:
label:Expected Behavior
description:"What you expected to happen."
validations:
required:true
- type:textarea
attributes:
label:Actual Behavior
description:"What actually happened. Please include a screenshot of the issue if possible."
validations:
required:true
- type:textarea
attributes:
label:Steps to Reproduce
description:"Describe how to reproduce the issue. Please be sure to attach a workflow JSON or PNG, ideally one that doesn't require custom nodes to test. If the bug open happens when certain custom nodes are used, most likely that custom node is what has the bug rather than ComfyUI, in which case it should be reported to the node's author."
validations:
required:true
- type:textarea
attributes:
label:Debug Logs
description:"Please copy the output from your terminal logs here."
render:powershell
validations:
required:true
- type:textarea
attributes:
label:Other
description:"Any other additional information you think might be helpful."
description:"You have an idea for something new you would like to see added to ComfyUI's core."
labels:["Feature"]
body:
- type:markdown
attributes:
value:|
Before submitting a **Feature Request**, please ensure the following:
**1:** You are running the latest version of ComfyUI.
**2:** You have looked to make sure there is not already a feature that does what you need, and there is not already a Feature Request listed for the same idea.
**3:** This is something that makes sense to add to ComfyUI Core, and wouldn't make more sense as a custom node.
If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
- type:textarea
attributes:
label:Feature Idea
description:"Describe the feature you want to see."
validations:
required:true
- type:textarea
attributes:
label:Existing Solutions
description:"Please search through available custom nodes / extensions to see if there are existing custom solutions for this. If so, please link the options you found here as a reference."
validations:
required:false
- type:textarea
attributes:
label:Other
description:"Any other additional information you think might be helpful."
description:"Use this if you need help with something, or you're experiencing an issue."
labels:["User Support"]
body:
- type:markdown
attributes:
value:|
Before submitting a **User Report** issue, please ensure the following:
**1:** You are running the latest version of ComfyUI.
**2:** You have made an effort to find public answers to your question before asking here. In other words, you googled it first, and scrolled through recent help topics.
If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first.
- type:textarea
attributes:
label:Your question
description:"Post your question here. Please be as detailed as possible."
validations:
required:true
- type:textarea
attributes:
label:Logs
description:"If your question relates to an issue you're experiencing, please go to `Server` -> `Logs` -> potentially set `View Type` to `Debug` as well, then copypaste all the text into here."
render:powershell
validations:
required:false
- type:textarea
attributes:
label:Other
description:"Any other additional information you think might be helpful."
body: '(Automated Bot Message) CI Tests are running, you can view the results at https://ci.comfy.org/?branch=${{ github.event.pull_request.number }}%2Fmerge'
stale-issue-message: "This issue is being marked stale because it has not had any activity for 30 days. Reply below within 7 days if your issue still isn't solved, and it will be left open. Otherwise, the issue will be closed automatically."
Welcome, and thank you for your interest in contributing to ComfyUI!
There are several ways in which you can contribute, beyond writing code. The goal of this document is to provide a high-level overview of how you can get involved.
## Asking Questions
Have a question? Instead of opening an issue, please ask on [Discord](https://comfy.org/discord) or [Matrix](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) channels. Our team and the community will help you.
## Providing Feedback
Your comments and feedback are welcome, and the development team is available via a handful of different channels.
See the `#bug-report`, `#feature-request` and `#feedback` channels on Discord.
## Reporting Issues
Have you identified a reproducible problem in ComfyUI? Do you have a feature request? We want to hear about it! Here's how you can report your issue as effectively as possible.
### Look For an Existing Issue
Before you create a new issue, please do a search in [open issues](https://github.com/comfyanonymous/ComfyUI/issues) to see if the issue or feature request has already been filed.
If you find your issue already exists, make relevant comments and add your [reaction](https://github.com/blog/2119-add-reactions-to-pull-requests-issues-and-comments). Use a reaction in place of a "+1" comment:
* 👍 - upvote
* 👎 - downvote
If you cannot find an existing issue that describes your bug or feature, create a new issue. We have an issue template in place to organize new issues.
### Creating Pull Requests
* Please refer to the article on [creating pull requests](https://github.com/comfyanonymous/ComfyUI/wiki/How-to-Contribute-Code) and contributing to this project.
## Thank You
Your contributions to open source, large or small, make great projects like this possible. Thank you for taking the time to contribute.
A powerful and modular stable diffusion GUI and backend.
-----------

<div align="center">
This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and to see what ComfyUI can do you can check out:
ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
- Loading full workflows (with seeds) from generated PNG files.
- Loading full workflows (with seeds) from generated PNG, WebP and FLAC files.
- Saving/Loading workflows as Json files.
- Nodes interface can be used to create complex workflows like one for [Hires fix](https://comfyanonymous.github.io/ComfyUI_examples/2_pass_txt2img/) or much more advanced ones.
| `.` | Fit view to selection (Whole graph when nothing is selected) |
| Double-Click LMB | Open node quick search palette |
| `Shift` + Drag | Move multiple wires at once |
| `Ctrl` + `Alt` + LMB | Disconnect all wires from clicked slot |
Ctrl can also be replaced with Cmd instead for MacOS users
`Ctrl` can also be replaced with `Cmd` instead for macOS users
# Installing
## Windows
## Windows Portable
There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu118_or_cpu.7z)
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
Just download, extract and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
If you have trouble extracting it, right click the file -> properties -> unblock
#### How do I share models between another UI and ComfyUI?
See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
## Colab Notebook
## Jupyter Notebook
To run it on colab or paperspace you can use my [Colab Notebook](notebooks/comfyui_colab.ipynb) here: [Link to open with google colab](https://colab.research.google.com/github/comfyanonymous/ComfyUI/blob/master/notebooks/comfyui_colab.ipynb)
To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)
(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
1. To install PyTorch nightly, use the following command:
For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).
### NVIDIA
Nvidia users should install torch and xformers using this command:
Nvidia users should install stable pytorch using this command:
Mac/MPS: There is basic support in the code but until someone writes install instructions you are on your own.
You can install ComfyUI on Apple Silicon Macs (M1 or M2) with any recent macOS version.
### I already have another UI for Stable Diffusion installed do I really have to install all of these dependencies?
1. Install pytorch nightly. For instructions, read the [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) Apple Developer guide (make sure to install the latest pytorch nightly).
1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux.
1. Install the ComfyUI [dependencies](#dependencies). If you have another Stable Diffusion UI [you might be able to reuse the dependencies](#i-already-have-another-ui-for-stable-diffusion-installed-do-i-really-have-to-install-all-of-these-dependencies).
1. Launch ComfyUI by running `python main.py`
You don't. If you have another UI installed and working with its own python venv you can use that venv to run ComfyUI. You can open up your favorite terminal and activate it:
> **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux).
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
With Powershell: ```"path_to_other_sd_gui\venv\Scripts\Activate.ps1"```
#### Ascend NPUs
With cmd.exe: ```"path_to_other_sd_gui\venv\Scripts\activate.bat"```
For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
And then you can use that terminal to run ComfyUI without installing any dependencies. Note that the venv folder might be called something else depending on the SD UI.
1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary.
2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform.
3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
#### Cambricon MLUs
For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html)
2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
3. Launch ComfyUI by running `python main.py`
# Running
```python main.py```
### For AMD 6700, 6600 and maybe others
### For AMD cards not officially supported by ROCm
Try running it with this command if you have issues:
For 6700, 6600 and maybe other RDNA2 or older: ```HSA_OVERRIDE_GFX_VERSION=10.3.0 python main.py```
For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 python main.py```
### AMD ROCm Tips
You can enable experimental memory efficient attention on recent pytorch in ComfyUI on some AMD GPUs using this command; it should already be enabled by default on RDNA3. If this improves speed for you on the latest pytorch on your GPU please report it so that I can enable it by default.
You can also try setting this env variable `PYTORCH_TUNABLEOP_ENABLED=1` which might speed things up at the cost of a very slow initial run.
# Notes
You can use () to change emphasis of a word or phrase like: (good code:1.2) or (bad code:0.8)
You can use {day|night}, for wildcard/dynamic prompts. With this syntax "{wild|card|test}" will be randomly replaced by either "wild", "card" or "test" by the frontend every time you queue the prompt. To use {} characters in your actual prompt escape them like: \\{ or \\}.
Dynamic prompts also support C-style comments, like `// comment` or `/* comment */`.
To use a textual inversion concepts/embeddings in a text prompt put them in the models/embeddings directory and use them in the CLIPTextEncode node like this (you can omit the .pt extension):
```embedding:embedding_filename.pt```
### Fedora
To get python 3.10 on fedora:
```dnf install python3.10```
Then you can:
```python3.10 -m ensurepip```
This will let you use: pip3.10 to install all the dependencies.
## How to show high-quality previews?
Use ```--preview-method auto``` to enable previews.
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.
## How to use TLS/SSL?
Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.
<br/><br/>If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal.
Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app will now be accessible with `https://...` instead of `http://...`.
## How to increase generation speed?
Make sure you use the regular loaders/Load Checkpoint node to load checkpoints. It will auto pick the right settings depending on your GPU.
You can set this command line setting to disable the upcasting to fp32 in some cross attention operations which will increase your speed. Note that this will very likely give you black images on SD2.x models. If you use xformers this option does not do anything.
```--dont-upcast-attention```
## Support and dev channel
[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
[Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).
See also: [https://www.comfy.org/](https://www.comfy.org/)
## Frontend Development
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
### Reporting Issues and Requesting Features
For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
### Using the Latest Frontend
The new frontend is now the default for ComfyUI. However, please note:
1. The frontend in the main ComfyUI repository is updated fortnightly.
2. Daily releases are available in the separate frontend repository.
To use the most up-to-date frontend version:
1. For the latest daily release, launch ComfyUI with this command line argument:
This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
### Accessing the Legacy Frontend
If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
# QA
### Why did you make this?
I wanted to learn how Stable Diffusion worked in detail. I also wanted something clean and powerful that would let me experiment with SD without restrictions.
### Who is this for?
This is for anyone that wants to make complex workflows with SD or that wants to learn more how SD works. The interface follows closely how SD works and the code should be much more simple to understand than other SD UIs.
### Which GPU should I buy for this?
[See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI)
All routes under the `/internal` path are designated for **internal use by ComfyUI only**. These routes are not intended for use by external applications and may change at any time without notice.
"""Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
parser.add_argument("--listen",type=str,default="127.0.0.1",metavar="IP",nargs="?",const="0.0.0.0",help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)")
parser.add_argument("--listen",type=str,default="127.0.0.1",metavar="IP",nargs="?",const="0.0.0.0,::",help="Specify the IP address to listen on (default: 127.0.0.1). You can give a list of ip addresses by separating them with a comma like: 127.2.2.2,127.3.3.3 If --listen is provided without an argument, it defaults to 0.0.0.0,:: (listens on all ipv4 and ipv6)")
parser.add_argument("--port",type=int,default=8188,help="Set the listen port.")
parser.add_argument("--tls-keyfile",type=str,help="Path to TLS (SSL) key file. Enables TLS, makes app accessible at https://... requires --tls-certfile to function")
parser.add_argument("--tls-certfile",type=str,help="Path to TLS (SSL) certificate file. Enables TLS, makes app accessible at https://... requires --tls-keyfile to function")
parser.add_argument("--enable-cors-header",type=str,default=None,metavar="ORIGIN",nargs="?",const="*",help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
parser.add_argument("--max-upload-size",type=float,default=100,help="Set the maximum upload size in MB.")
parser.add_argument("--base-directory",type=str,default=None,help="Set the ComfyUI base directory for models, custom_nodes, input, output, temp, and user directories.")
parser.add_argument("--extra-model-paths-config",type=str,default=None,metavar="PATH",nargs='+',action='append',help="Load one or more extra_model_paths.yaml files.")
parser.add_argument("--output-directory",type=str,default=None,help="Set the ComfyUI output directory.")
parser.add_argument("--output-directory",type=str,default=None,help="Set the ComfyUI output directory. Overrides --base-directory.")
parser.add_argument("--temp-directory",type=str,default=None,help="Set the ComfyUI temp directory (default is in the ComfyUI directory). Overrides --base-directory.")
parser.add_argument("--input-directory",type=str,default=None,help="Set the ComfyUI input directory. Overrides --base-directory.")
parser.add_argument("--auto-launch",action="store_true",help="Automatically launch ComfyUI in the default browser.")
parser.add_argument("--disable-auto-launch",action="store_true",help="Disable auto launching the browser.")
parser.add_argument("--cuda-device",type=int,default=None,metavar="DEVICE_ID",help="Set the id of the cuda device this instance will use.")
parser.add_argument("--dont-upcast-attention",action="store_true",help="Disable upcasting of attention. Can boost speed but increase the chances of black images.")
parser.add_argument("--force-fp32",action="store_true",help="Force fp32 (If this makes your GPU work better please report it).")
cm_group=parser.add_mutually_exclusive_group()
cm_group.add_argument("--cuda-malloc",action="store_true",help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
parser.add_argument("--oneapi-device-selector",type=str,default=None,metavar="SELECTOR_STRING",help="Sets the oneAPI device(s) this instance will use.")
parser.add_argument("--disable-ipex-optimize",action="store_true",help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
parser.add_argument("--supports-fp8-compute",action="store_true",help="ComfyUI will act like if the device supports fp8 compute.")
class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
    Auto = "auto"
    Latent2RGB = "latent2rgb"
    TAESD = "taesd"
parser.add_argument("--preview-method",type=LatentPreviewMethod,default=LatentPreviewMethod.NoPreviews,help="Default preview method for sampler nodes.",action=EnumAction)
parser.add_argument("--preview-size",type=int,default=512,help="Sets the maximum preview size for sampler nodes.")
cache_group=parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-classic",action="store_true",help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru",type=int,default=0,help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none",action="store_true",help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
attn_group=parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention",action="store_true",help="Use the split cross attention optimization instead of the sub-quadratic one. Ignored when xformers is used.")
attn_group.add_argument("--use-split-cross-attention",action="store_true",help="Use the split cross attention optimization. Ignored when xformers is used.")
attn_group.add_argument("--use-quad-cross-attention",action="store_true",help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
attn_group.add_argument("--use-pytorch-cross-attention",action="store_true",help="Use the new pytorch 2.0 cross attention function.")
upcast.add_argument("--force-upcast-attention",action="store_true",help="Force enable attention upcasting, please report if it fixes black images.")
upcast.add_argument("--dont-upcast-attention",action="store_true",help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
vram_group=parser.add_mutually_exclusive_group()
vram_group.add_argument("--gpu-only",action="store_true",help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
vram_group.add_argument("--highvram",action="store_true",help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
vram_group.add_argument("--normalvram",action="store_true",help="Used to force normal vram use if lowvram gets automatically enabled.")
vram_group.add_argument("--lowvram",action="store_true",help="Split the unet in parts to use less vram.")
vram_group.add_argument("--cpu",action="store_true",help="To use the CPU for everything (slow).")
parser.add_argument("--reserve-vram",type=float,default=None,help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
parser.add_argument("--default-hashing-function",type=str,choices=['md5','sha1','sha256','sha512'],default='sha256',help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
parser.add_argument("--disable-smart-memory",action="store_true",help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
parser.add_argument("--deterministic",action="store_true",help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
class PerformanceFeature(enum.Enum):
    Fp16Accumulation = "fp16_accumulation"
    Fp8MatrixMultiplication = "fp8_matrix_mult"
    CublasOps = "cublas_ops"
parser.add_argument("--fast",nargs="*",type=PerformanceFeature,help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
parser.add_argument("--mmap-torch-files",action="store_true",help="Use mmap when loading ckpt/pt files.")
parser.add_argument("--dont-print-server",action="store_true",help="Don't print server output.")
parser.add_argument("--quick-test-for-ci",action="store_true",help="Quick test for CI.")
parser.add_argument("--windows-standalone-build",action="store_true",help="Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup).")
args=parser.parse_args()
parser.add_argument("--disable-metadata",action="store_true",help="Disable saving prompt metadata in files.")
parser.add_argument("--disable-all-custom-nodes",action="store_true",help="Disable loading all custom nodes.")
parser.add_argument("--disable-api-nodes",action="store_true",help="Disable loading all api nodes.")
parser.add_argument("--verbose",default='INFO',const='DEBUG',nargs="?",choices=['DEBUG','INFO','WARNING','ERROR','CRITICAL'],help='Set the logging level')
parser.add_argument("--log-stdout",action="store_true",help="Send normal process output to stdout instead of stderr (default).")
Specifies the version of the frontend to be used. This command needs internet connectivity to query and
download available frontend implementations from GitHub releases.
The version string should be in the format of:
[repoOwner]/[repoName]@[version]
where version is one of: "latest" or a valid version number (e.g. "1.0.0")
""",
)
def is_valid_directory(path: str) -> str:
    """Validate if the given path is a directory, and check permissions."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
    if not os.access(path, os.R_OK):
        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
    return path
parser.add_argument(
    "--front-end-root",
    type=is_valid_directory,
    default=None,
    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
)
parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. Overrides --base-directory.")
"""Base class for string enums. Python's StrEnum is not available until 3.11."""
def__str__(self)->str:
returnself.value
classIO(StrEnum):
"""Node input/output data types.
Includes functionality for ``"*"`` (`ANY`) and ``"MULTI,TYPES"``.
"""
STRING="STRING"
IMAGE="IMAGE"
MASK="MASK"
LATENT="LATENT"
BOOLEAN="BOOLEAN"
INT="INT"
FLOAT="FLOAT"
COMBO="COMBO"
CONDITIONING="CONDITIONING"
SAMPLER="SAMPLER"
SIGMAS="SIGMAS"
GUIDER="GUIDER"
NOISE="NOISE"
CLIP="CLIP"
CONTROL_NET="CONTROL_NET"
VAE="VAE"
MODEL="MODEL"
CLIP_VISION="CLIP_VISION"
CLIP_VISION_OUTPUT="CLIP_VISION_OUTPUT"
STYLE_MODEL="STYLE_MODEL"
GLIGEN="GLIGEN"
UPSCALE_MODEL="UPSCALE_MODEL"
AUDIO="AUDIO"
WEBCAM="WEBCAM"
POINT="POINT"
FACE_ANALYSIS="FACE_ANALYSIS"
BBOX="BBOX"
SEGS="SEGS"
VIDEO="VIDEO"
ANY="*"
"""Always matches any type, but at a price.
Causes some functionality issues (e.g. reroutes, link types), and should be avoided whenever possible.
"""
NUMBER="FLOAT,INT"
"""A float or an int - could be either"""
PRIMITIVE="STRING,FLOAT,INT,BOOLEAN"
"""Could be any of: string, float, int, or bool"""
def__ne__(self,value:object)->bool:
ifself=="*"orvalue=="*":
returnFalse
ifnotisinstance(value,str):
returnTrue
a=frozenset(self.split(","))
b=frozenset(value.split(","))
returnnot(b.issubset(a)ora.issubset(b))
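Given these semantics, type matching behaves like this (a small illustration using the class above):

```python
assert not (IO.ANY != IO.IMAGE)     # "*" matches anything
assert not (IO.NUMBER != IO.FLOAT)  # "FLOAT,INT" accepts "FLOAT"
assert IO.IMAGE != IO.MASK          # unrelated types do not match
```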
class RemoteInputOptions(TypedDict):
    route: str
    """The route to the remote source."""
    refresh_button: bool
    """Specifies whether to show a refresh button in the UI below the widget."""
    control_after_refresh: Literal["first", "last"]
    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
    timeout: int
    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
    max_retries: int
    """The maximum number of retries before aborting the request."""
    refresh: int
    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""

class MultiSelectOptions(TypedDict):
    placeholder: NotRequired[str]
    """The placeholder text to display in the multi-select widget when no items are selected."""
    chip: NotRequired[bool]
    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
class InputTypeOptions(TypedDict):
"""Provides type hinting for the return type of the INPUT_TYPES node function.
Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).
"""Forces the input to be an input slot rather than a widget even if a widget is available for the input type."""
lazy: NotRequired[bool]
"""Declares that this input uses lazy evaluation"""
rawLink: NotRequired[bool]
"""When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
tooltip: NotRequired[str]
"""Tooltip for the input (or widget), shown on pointer hover"""
socketless: NotRequired[bool]
"""All inputs (including widgets) have an input socket to connect links. When ``true``, if there is a widget for this input, no socket will be created.
"""Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
options: NotRequired[list[str | int | float]]
"""COMBO type only. Specifies the selectable options for the combo widget.
"""Provides type hinting for the hidden entry of node INPUT_TYPES."""
node_id: NotRequired[Literal["UNIQUE_ID"]]
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
unique_id: NotRequired[Literal["UNIQUE_ID"]]
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
prompt: NotRequired[Literal["PROMPT"]]
"""PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
"""EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
dynprompt: NotRequired[Literal["DYNPROMPT"]]
"""DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
"""A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
All inputs of ``type`` will become ``list[type]``, regardless of how many items are passed in. This also affects ``check_lazy_status``.
From the docs:
A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.
"`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."