Add pre-commit configuration and update README for backend development

2025-03-02 12:43:19 -08:00
60 changed files with 426 additions and 1593 deletions
--- a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@@ -1,2 +0,0 @@
 .\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
 pause
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -22,7 +22,7 @@ on:
        description: 'Python patch version'
        required: true
        type: string
-        default: "9"
+        default: "8"
 jobs:
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@@ -29,7 +29,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "9"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "126"
      python_minor:
        description: 'python minor version'
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "2"
+        default: "1"
 #  push:
 #    branches:
 #      - master
@@ -34,7 +34,7 @@ jobs:
    steps:
        - uses: actions/checkout@v4
          with:
-            fetch-depth: 30
+            fetch-depth: 0
            persist-credentials: false
        - uses: actions/setup-python@v5
          with:
@@ -74,7 +74,7 @@ jobs:
            pause" > ./update/update_comfyui_and_python_dependencies.bat
            cd ..
-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
            mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z
            cd ComfyUI_windows_portable_nightly_pytorch
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "9"
+        default: "8"
 #  push:
 #    branches:
 #      - master
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
 repos:
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: v0.0.241  # Use the desired version of Ruff
    hooks:
      - id: ruff
  - repo: local
    hooks:
      - id: pytest
        name: Run Pytest
        entry: pytest
        language: system
        types: [python] 
--- a/5
+++ b/5
@@ -19,6 +19,5 @@
 /app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
 /utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
-# Node developers
+# Extra nodes
-/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
+/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink
 /comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
--- a/README.md
+++ b/README.md
@@ -215,9 +215,9 @@ Nvidia users should install stable pytorch using this command:
 ```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126```
-This is the command to install pytorch nightly instead which supports the new blackwell 50xx series GPUs and might have performance improvements.
+This is the command to install pytorch nightly instead which might have performance improvements:
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126```
 #### Troubleshooting
@@ -330,6 +330,25 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
 See also: [https://www.comfy.org/](https://www.comfy.org/)
 ## ComfyUI Backend Development
 ### Setup Environment
 Install pre-commit to run tests and linters
 ```
 pip install pre-commit
 ```
 ```
 pre-commit install
 ```
 ### Reporting Issues and Requesting Features
 For any bugs, issues, or feature requests related to the backend, please use the [ComfyUI repository](https://github.com/comfyanonymous/ComfyUI). This will help us manage and address backend-specific concerns more efficiently.
 ## Frontend Development
 As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -3,7 +3,6 @@ import argparse
 import logging
 import os
 import re
 import sys
 import tempfile
 import zipfile
 import importlib
@@ -11,43 +10,19 @@ from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import TypedDict, Optional
 from importlib.metadata import version
 import requests
 from typing_extensions import NotRequired
 from comfy.cli_args import DEFAULT_VERSION_STRING
 import app.logger
 # The path to the requirements.txt file
 req_path = Path(__file__).parents[1] / "requirements.txt"
 def frontend_install_warning_message():
    """The warning message to display when the frontend version is not up to date."""
    extra = ""
    if sys.flags.no_user_site:
        extra = "-s "
    return f"Please install the updated requirements.txt file by running:\n{sys.executable} {extra}-m pip install -r {req_path}\n\nThis error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.\n\nIf you are on the portable package you can run: update\\update_comfyui.bat to solve this problem"
-def check_frontend_version():
+try:
-    """Check if the frontend version is up to date."""
+    import comfyui_frontend_package
-
+except ImportError as e:
-    def parse_version(version: str) -> tuple[int, int, int]:
+    # TODO: Remove the check after roll out of 0.3.16
-        return tuple(map(int, version.split(".")))
+    logging.error("comfyui-frontend-package is not installed. Please install the updated requirements.txt file by running: pip install -r requirements.txt")
-
+    raise e
    try:
        frontend_version_str = version("comfyui-frontend-package")
        frontend_version = parse_version(frontend_version_str)
        with open(req_path, "r", encoding="utf-8") as f:
            required_frontend = parse_version(f.readline().split("=")[-1])
        if frontend_version < required_frontend:
            app.logger.log_startup_warning("________________________________________________________________________\nWARNING WARNING WARNING WARNING WARNING\n\nInstalled frontend version {} is lower than the recommended version {}.\n\n{}\n________________________________________________________________________".format('.'.join(map(str, frontend_version)), '.'.join(map(str, required_frontend)), frontend_install_warning_message()))
        else:
            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
    except Exception as e:
        logging.error(f"Failed to check frontend version: {e}")
 REQUEST_TIMEOUT = 10  # seconds
@@ -144,17 +119,9 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:
 class FrontendManager:
    DEFAULT_FRONTEND_PATH = str(importlib.resources.files(comfyui_frontend_package) / "static")
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")
    @classmethod
    def default_frontend_path(cls) -> str:
        try:
            import comfyui_frontend_package
            return str(importlib.resources.files(comfyui_frontend_package) / "static")
        except ImportError:
            logging.error(f"\n\n********** ERROR ***********\n\ncomfyui-frontend-package is not installed. {frontend_install_warning_message()}\n********** ERROR **********\n")
            sys.exit(-1)
    @classmethod
    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
        """
@@ -191,8 +158,7 @@ class FrontendManager:
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            check_frontend_version()
+            return cls.DEFAULT_FRONTEND_PATH
            return cls.default_frontend_path()
        repo_owner, repo_name, version = cls.parse_version_string(version_string)
@@ -245,5 +211,4 @@ class FrontendManager:
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            check_frontend_version()
+            return cls.DEFAULT_FRONTEND_PATH
            return cls.default_frontend_path()
--- a/app/logger.py
+++ b/app/logger.py
@@ -82,17 +82,3 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
        logger.addHandler(stdout_handler)
    logger.addHandler(stream_handler)
 STARTUP_WARNINGS = []
 def log_startup_warning(msg):
    logging.warning(msg)
    STARTUP_WARNINGS.append(msg)
 def print_startup_warnings():
    for s in STARTUP_WARNINGS:
        logging.warning(s)
    STARTUP_WARNINGS.clear()
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -1,6 +1,7 @@
 import argparse
 import enum
 import os
 from typing import Optional
 import comfy.options
@@ -165,14 +166,13 @@ parser.add_argument(
    """,
 )
-def is_valid_directory(path: str) -> str:
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
-    """Validate if the given path is a directory, and check permissions."""
+    """Validate if the given path is a directory."""
-    if not os.path.exists(path):
+    if path is None:
-        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
+        return None
    if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
    if not os.access(path, os.R_OK):
        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
    return path
 parser.add_argument(
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -97,12 +97,8 @@ class CLIPTextModel_(torch.nn.Module):
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
-    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
-        if embeds is not None:
+        x = self.embeddings(input_tokens, dtype=dtype)
            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
        else:
            x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -120,10 +116,7 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)
-        if num_tokens is not None:
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
        else:
            pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output
 class CLIPTextModel(torch.nn.Module):
@@ -211,15 +204,6 @@ class CLIPVision(torch.nn.Module):
            pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output
 class LlavaProjector(torch.nn.Module):
    def __init__(self, in_dim, out_dim, dtype, device, operations):
        super().__init__()
        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
    def forward(self, x):
        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
@@ -229,16 +213,7 @@ class CLIPVisionModelProjection(torch.nn.Module):
        else:
            self.visual_projection = lambda a: a
        if "llava3" == config_dict.get("projector_type", None):
            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
        else:
            self.multi_modal_projector = None
    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
        out = self.visual_projection(x[2])
-        projected = None
+        return (x[0], x[1], out)
        if self.multi_modal_projector is not None:
            projected = self.multi_modal_projector(x[1])
        return (x[0], x[1], out, projected)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -65,7 +65,6 @@ class ClipVisionModel():
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
        outputs["mm_projected"] = out[3]
        return outputs
 def convert_to_transformers(sd, prefix):
@@ -105,10 +104,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
-            if "multi_modal_projector.linear_1.bias" in sd:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
            else:
                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
        else:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
    else:
--- a/comfy/clip_vision_config_vitl_336_llava.json
+++ b/comfy/clip_vision_config_vitl_336_llava.json
@@ -1,19 +0,0 @@
 {
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 1024,
  "image_size": 336,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-5,
  "model_type": "clip_vision_model",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 14,
  "projection_dim": 768,
  "projector_type": "llava3",
  "torch_dtype": "float32"
 }
--- a/comfy/comfy_types/init.py
+++ b/comfy/comfy_types/init.py
@@ -1,6 +1,6 @@
 import torch
 from typing import Callable, Protocol, TypedDict, Optional, List
-from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator
+from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin
 class UnetApplyFunction(Protocol):
@@ -42,5 +42,4 @@ __all__ = [
    InputTypeDict.__name__,
    ComfyNodeABC.__name__,
    CheckLazyMixin.__name__,
    FileLocator.__name__,
 ]
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 from typing import Literal, TypedDict
 from typing_extensions import NotRequired
 from abc import ABC, abstractmethod
 from enum import Enum
@@ -27,7 +26,6 @@ class IO(StrEnum):
    BOOLEAN = "BOOLEAN"
    INT = "INT"
    FLOAT = "FLOAT"
    COMBO = "COMBO"
    CONDITIONING = "CONDITIONING"
    SAMPLER = "SAMPLER"
    SIGMAS = "SIGMAS"
@@ -68,7 +66,6 @@ class IO(StrEnum):
        b = frozenset(value.split(","))
        return not (b.issubset(a) or a.issubset(b))
 class RemoteInputOptions(TypedDict):
    route: str
    """The route to the remote source."""
@@ -83,14 +80,6 @@ class RemoteInputOptions(TypedDict):
    refresh: int
    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
 class MultiSelectOptions(TypedDict):
    placeholder: NotRequired[str]
    """The placeholder text to display in the multi-select widget when no items are selected."""
    chip: NotRequired[bool]
    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
 class InputTypeOptions(TypedDict):
    """Provides type hinting for the return type of the INPUT_TYPES node function.
@@ -125,7 +114,7 @@ class InputTypeOptions(TypedDict):
    # default: bool
    label_on: str
    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
-    label_off: str
+    label_on: str
    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
    # class InputTypeString(InputTypeOptions):
    # default: str
@@ -144,22 +133,7 @@ class InputTypeOptions(TypedDict):
    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
    """
    remote: RemoteInputOptions
-    """Specifies the configuration for a remote input.
+    """Specifies the configuration for a remote input."""
    Available after ComfyUI frontend v1.9.7
    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
    control_after_generate: bool
    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
    options: NotRequired[list[str | int | float]]
    """COMBO type only. Specifies the selectable options for the combo widget.
    Prefer:
    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
    Over:
    [["Option 1", "Option 2", "Option 3"]]
    """
    multi_select: NotRequired[MultiSelectOptions]
    """COMBO type only. Specifies the configuration for a multi-select widget.
    Available after ComfyUI frontend v1.13.4
    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""
 class HiddenInputTypeDict(TypedDict):
@@ -319,14 +293,3 @@ class CheckLazyMixin:
        need = [name for name in kwargs if kwargs[name] is None]
        return need
 class FileLocator(TypedDict):
    """Provides type hinting for the file location"""
    filename: str
    """The filename of the file."""
    subfolder: str
    """The subfolder of the file."""
    type: Literal["input", "output", "temp"]
    """The root folder of the file."""
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -688,10 +688,10 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    if len(sigmas) <= 1:
        return x
    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@@ -762,10 +762,10 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    old_denoised = None
@@ -808,10 +808,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if len(sigmas) <= 1:
        return x
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    denoised_1, denoised_2 = None, None
@@ -858,7 +858,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@@ -867,7 +867,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@@ -876,7 +876,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@@ -1366,59 +1366,3 @@ def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None,
            x = x + d_bar * dt
        old_d = d
    return x
@torch.no_grad()
 def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
    """
    Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
    """
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    def default_noise_scaler(sigma):
        return sigma * ((sigma ** 0.3).exp() + 10.0)
    noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
    num_integration_points = 200.0
    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
    old_denoised = None
    old_denoised_d = None
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        stage_used = min(max_stage, i + 1)
        if sigmas[i + 1] == 0:
            x = denoised
        elif stage_used == 1:
            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
            x = r * x + (1 - r) * denoised
        else:
            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
            x = r * x + (1 - r) * denoised
            dt = sigmas[i + 1] - sigmas[i]
            sigma_step_size = -dt / num_integration_points
            sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
            scaled_pos = noise_scaler(sigma_pos)
            # Stage 2
            s = torch.sum(1 / scaled_pos) * sigma_step_size
            denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
            x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
            if stage_used >= 3:
                # Stage 3
                s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
                denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
                x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
            old_denoised_d = denoised_d
        if s_noise != 0 and sigmas[i + 1] > 0:
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt()
        old_denoised = denoised
    return x
--- a/comfy/ldm/cascade/stage_a.py
+++ b/comfy/ldm/cascade/stage_a.py
@@ -19,10 +19,6 @@
 import torch
 from torch import nn
 from torch.autograd import Function
 import comfy.ops
 ops = comfy.ops.disable_weight_init
 class vector_quantize(Function):
    @staticmethod
@@ -125,15 +121,15 @@ class ResBlock(nn.Module):
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
-            ops.Conv2d(c, c, kernel_size=3, groups=c)
+            nn.Conv2d(c, c, kernel_size=3, groups=c)
        )
        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
-            ops.Linear(c, c_hidden),
+            nn.Linear(c, c_hidden),
            nn.GELU(),
-            ops.Linear(c_hidden, c),
+            nn.Linear(c_hidden, c),
        )
        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@@ -175,16 +171,16 @@ class StageA(nn.Module):
        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
-            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
-                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
-            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)
@@ -195,7 +191,7 @@ class StageA(nn.Module):
        # Decoder blocks
        up_blocks = [nn.Sequential(
-            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for j in range(bottleneck_blocks if i == 0 else 1):
@@ -203,11 +199,11 @@ class StageA(nn.Module):
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
-                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
-            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )
@@ -236,17 +232,17 @@ class Discriminator(nn.Module):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
-            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = c_hidden // (2 ** max((d - i), 0))
            c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
-        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
        self.logits = nn.Sigmoid()
    def forward(self, x, cond=None):
--- a/comfy/ldm/cascade/stage_c_coder.py
+++ b/comfy/ldm/cascade/stage_c_coder.py
@@ -19,9 +19,6 @@ import torch
 import torchvision
 from torch import nn
 import comfy.ops
 ops = comfy.ops.disable_weight_init
 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@@ -29,7 +26,7 @@ class EfficientNetEncoder(nn.Module):
        super().__init__()
        self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
        self.mapper = nn.Sequential(
-            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
        )
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@@ -37,7 +34,7 @@ class EfficientNetEncoder(nn.Module):
    def forward(self, x):
        x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
+        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
        o = self.mapper(self.backbone(x))
        return o
@@ -47,39 +44,39 @@ class Previewer(nn.Module):
    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        self.blocks = nn.Sequential(
-            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),
-            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),
-            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),
-            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),
-            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),
-            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),
-            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
        )
    def forward(self, x):
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -105,9 +105,7 @@ class Modulation(nn.Module):
        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
    def forward(self, vec: Tensor) -> tuple:
-        if vec.ndim == 2:
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
            vec = vec[:, None, :]
        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)
        return (
            ModulationOut(*out[:3]),
@@ -115,20 +113,6 @@ class Modulation(nn.Module):
        )
 def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
    if modulation_dims is None:
        if m_add is not None:
            return tensor * m_mult + m_add
        else:
            return tensor * m_mult
    else:
        for d in modulation_dims:
            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
            if m_add is not None:
                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
        return tensor
 class DoubleStreamBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()
@@ -159,20 +143,20 @@ class DoubleStreamBlock(nn.Module):
        )
        self.flipped_img_txt = flipped_img_txt
-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)
        # prepare image for attention
        img_modulated = self.img_norm1(img)
-        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
@@ -195,12 +179,12 @@ class DoubleStreamBlock(nn.Module):
            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
        # calculate the img bloks
-        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
        # calculate the txt bloks
-        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
-        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
        if txt.dtype == torch.float16:
            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@@ -244,9 +228,9 @@ class SingleStreamBlock(nn.Module):
        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None) -> Tensor:
        mod, _ = self.modulation(vec)
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        qkv, mlp = torch.split(self.linear1((1 + mod.scale) * self.pre_norm(x) + mod.shift), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)
@@ -255,7 +239,7 @@ class SingleStreamBlock(nn.Module):
        attn = attention(q, k, v, pe=pe, mask=attn_mask)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += apply_mod(output, mod.gate, None, modulation_dims)
+        x += mod.gate * output
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
        return x
@@ -268,11 +252,8 @@ class LastLayer(nn.Module):
        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
-    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
-        if vec.ndim == 2:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
-            vec = vec[:, None, :]
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
        x = self.linear(x)
        return x
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -227,7 +227,6 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
        guiding_frame_index=None,
        control=None,
        transformer_options={},
    ) -> Tensor:
@@ -238,17 +237,7 @@ class HunyuanVideo(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
-        if guiding_frame_index is not None:
+        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
            modulation_dims_txt = [(0, None, 1)]
        else:
            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
            modulation_dims = None
            modulation_dims_txt = None
        if self.params.guidance_embed:
            if guidance is not None:
@@ -275,14 +264,14 @@ class HunyuanVideo(nn.Module):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out
-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask)
            if control is not None: # Controlnet
                control_i = control.get("input")
@@ -297,13 +286,13 @@ class HunyuanVideo(nn.Module):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"])
                    return out
-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask}, {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
            if control is not None: # Controlnet
                control_o = control.get("output")
@@ -314,7 +303,7 @@ class HunyuanVideo(nn.Module):
        img = img[:, : img_len]
-        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        shape = initial_shape[-3:]
        for i in range(len(shape)):
@@ -324,7 +313,7 @@ class HunyuanVideo(nn.Module):
        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
        return img
-    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, control=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
@@ -336,5 +325,5 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, control, transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, control, transformer_options)
        return out
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@@ -7,7 +7,7 @@ from einops import rearrange
 import math
 from typing import Dict, Optional, Tuple
-from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
+from .symmetric_patchifier import SymmetricPatchifier
 def get_timestep_embedding(
@@ -377,16 +377,12 @@ class LTXVModel(torch.nn.Module):
                 positional_embedding_theta=10000.0,
                 positional_embedding_max_pos=[20, 2048, 2048],
                 causal_temporal_positioning=False,
                 vae_scale_factors=(8, 32, 32),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.generator = None
        self.vae_scale_factors = vae_scale_factors
        self.dtype = dtype
        self.out_channels = in_channels
        self.inner_dim = num_attention_heads * attention_head_dim
        self.causal_temporal_positioning = causal_temporal_positioning
        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
@@ -420,23 +416,42 @@ class LTXVModel(torch.nn.Module):
        self.patchifier = SymmetricPatchifier(1)
-    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
        patches_replace = transformer_options.get("patches_replace", {})
        indices_grid = self.patchifier.get_grid(
            orig_num_frames=x.shape[2],
            orig_height=x.shape[3],
            orig_width=x.shape[4],
            batch_size=x.shape[0],
            scale_grid=((1 / frame_rate) * 8, 32, 32),
            device=x.device,
        )
        if guiding_latent is not None:
            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
            ts *= input_ts
            ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
            timestep = self.patchifier.patchify(ts)
            input_x = x.clone()
            x[:, :, 0] = guiding_latent[:, :, 0]
            if guiding_latent_noise_scale > 0:
                if self.generator is None:
                    self.generator = torch.Generator(device=x.device).manual_seed(42)
                elif self.generator.device != x.device:
                    self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
                noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
                scale = guiding_latent_noise_scale * (input_ts ** 2)
                guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
                x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] *  (1.0 - scale[:, :, 0])
        orig_shape = list(x.shape)
-        x, latent_coords = self.patchifier.patchify(x)
+        x = self.patchifier.patchify(x)
        pixel_coords = latent_to_pixel_coords(
            latent_coords=latent_coords,
            scale_factors=self.vae_scale_factors,
            causal_fix=self.causal_temporal_positioning,
        )
        if keyframe_idxs is not None:
            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
        fractional_coords = pixel_coords.to(torch.float32)
        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
        x = self.patchify_proj(x)
        timestep = timestep * 1000.0
@@ -444,7 +459,7 @@ class LTXVModel(torch.nn.Module):
        if attention_mask is not None and not torch.is_floating_point(attention_mask):
            attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
-        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
+        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
        batch_size = x.shape[0]
        timestep, embedded_timestep = self.adaln_single(
@@ -504,4 +519,8 @@ class LTXVModel(torch.nn.Module):
            out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
        )
        if guiding_latent is not None:
            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
        # print("res", x)
        return x
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@@ -6,29 +6,16 @@ from einops import rearrange
 from torch import Tensor
-def latent_to_pixel_coords(
+def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
-    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
-) -> Tensor:
+    dims_to_append = target_dims - x.ndim
-    """
+    if dims_to_append < 0:
-    Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
+        raise ValueError(
-    configuration.
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
-    Args:
+        )
-        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
+    elif dims_to_append == 0:
-        containing the latent corner coordinates of each token.
+        return x
-        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
+    return x[(...,) + (None,) * dims_to_append]
        causal_fix (bool): Whether to take into account the different temporal scale
            of the first frame. Default = False for backwards compatibility.
    Returns:
        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
    """
    pixel_coords = (
        latent_coords
        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
    )
    if causal_fix:
        # Fix temporal scale for first frame to 1 due to causality
        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
    return pixel_coords
 class Patchifier(ABC):
@@ -57,26 +44,29 @@ class Patchifier(ABC):
    def patch_size(self):
        return self._patch_size
-    def get_latent_coords(
+    def get_grid(
-        self, latent_num_frames, latent_height, latent_width, batch_size, device
+        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
    ):
-        """
+        f = orig_num_frames // self._patch_size[0]
-        Return a tensor of shape [batch_size, 3, num_patches] containing the
+        h = orig_height // self._patch_size[1]
-            top-left corner latent coordinates of each latent patch.
+        w = orig_width // self._patch_size[2]
-        The tensor is repeated for each batch element.
+        grid_h = torch.arange(h, dtype=torch.float32, device=device)
-        """
+        grid_w = torch.arange(w, dtype=torch.float32, device=device)
-        latent_sample_coords = torch.meshgrid(
+        grid_f = torch.arange(f, dtype=torch.float32, device=device)
-            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
+        grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')
-            torch.arange(0, latent_height, self._patch_size[1], device=device),
+        grid = torch.stack(grid, dim=0)
-            torch.arange(0, latent_width, self._patch_size[2], device=device),
+        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-            indexing="ij",
+
-        )
+        if scale_grid is not None:
-        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
+            for i in range(3):
-        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+                if isinstance(scale_grid[i], Tensor):
-        latent_coords = rearrange(
+                    scale = append_dims(scale_grid[i], grid.ndim - 1)
-            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
+                else:
-        )
+                    scale = scale_grid[i]
-        return latent_coords
+                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
        return grid
 class SymmetricPatchifier(Patchifier):
@@ -84,8 +74,6 @@ class SymmetricPatchifier(Patchifier):
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        b, _, f, h, w = latents.shape
        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -93,7 +81,7 @@ class SymmetricPatchifier(Patchifier):
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
-        return latents, latent_coords
+        return latents
    def unpatchify(
        self,
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@@ -15,7 +15,6 @@ class CausalConv3d(nn.Module):
        stride: Union[int, Tuple[int]] = 1,
        dilation: int = 1,
        groups: int = 1,
        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()
@@ -39,7 +38,7 @@ class CausalConv3d(nn.Module):
            stride=stride,
            dilation=dilation,
            padding=padding,
-            padding_mode=spatial_padding_mode,
+            padding_mode="zeros",
            groups=groups,
        )
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -1,15 +1,13 @@
 from __future__ import annotations
 import torch
 from torch import nn
 from functools import partial
 import math
 from einops import rearrange
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
 ops = comfy.ops.disable_weight_init
 class Encoder(nn.Module):
@@ -34,7 +32,7 @@ class Encoder(nn.Module):
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
        latent_log_var (`str`, *optional*, defaults to `per_channel`):
-            The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
    """
    def __init__(
@@ -42,13 +40,12 @@ class Encoder(nn.Module):
        dims: Union[int, Tuple[int, int]] = 3,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        norm_num_groups: int = 32,
        patch_size: Union[int, Tuple[int]] = 1,
        norm_layer: str = "group_norm",  # group_norm, pixel_norm
        latent_log_var: str = "per_channel",
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -68,7 +65,6 @@ class Encoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.down_blocks = nn.ModuleList([])
@@ -86,7 +82,6 @@ class Encoder(nn.Module):
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -97,7 +92,6 @@ class Encoder(nn.Module):
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = make_conv_nd(
@@ -107,7 +101,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 1, 1),
                    causal=True,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = make_conv_nd(
@@ -117,7 +110,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(1, 2, 2),
                    causal=True,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
                block = make_conv_nd(
@@ -127,7 +119,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@@ -138,34 +129,6 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all_res":
                output_channel = block_params.get("multiplier", 2) * output_channel
                block = SpaceToDepthDownsample(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    stride=(2, 2, 2),
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space_res":
                output_channel = block_params.get("multiplier", 2) * output_channel
                block = SpaceToDepthDownsample(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    stride=(1, 2, 2),
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time_res":
                output_channel = block_params.get("multiplier", 2) * output_channel
                block = SpaceToDepthDownsample(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    stride=(2, 1, 1),
                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown block: {block_name}")
@@ -189,18 +152,10 @@ class Encoder(nn.Module):
            conv_out_channels *= 2
        elif latent_log_var == "uniform":
            conv_out_channels += 1
        elif latent_log_var == "constant":
            conv_out_channels += 1
        elif latent_log_var != "none":
            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
        self.conv_out = make_conv_nd(
-            dims,
+            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
            output_channel,
            conv_out_channels,
            3,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.gradient_checkpointing = False
@@ -242,15 +197,6 @@ class Encoder(nn.Module):
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            else:
                raise ValueError(f"Invalid input shape: {sample.shape}")
        elif self.latent_log_var == "constant":
            sample = sample[:, :-1, ...]
            approx_ln_0 = (
                -30
            )  # this is the minimal clamp value in DiagonalGaussianDistribution objects
            sample = torch.cat(
                [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
                dim=1,
            )
        return sample
@@ -285,7 +231,7 @@ class Decoder(nn.Module):
        dims,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
+        blocks=[("res_x", 1)],
        base_channels: int = 128,
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
@@ -293,7 +239,6 @@ class Decoder(nn.Module):
        norm_layer: str = "group_norm",
        causal: bool = True,
        timestep_conditioning: bool = False,
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@@ -319,7 +264,6 @@ class Decoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.up_blocks = nn.ModuleList([])
@@ -339,7 +283,6 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "attn_res_x":
                block = UNetMidBlock3D(
@@ -351,7 +294,6 @@ class Decoder(nn.Module):
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=timestep_conditioning,
                    attention_head_dim=block_params["attention_head_dim"],
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
@@ -364,21 +306,14 @@ class Decoder(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=block_params.get("inject_noise", False),
                    timestep_conditioning=False,
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = DepthToSpaceUpsample(
-                    dims=dims,
+                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
                    in_channels=input_channel,
                    stride=(2, 1, 1),
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = DepthToSpaceUpsample(
-                    dims=dims,
+                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
                    in_channels=input_channel,
                    stride=(1, 2, 2),
                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
                output_channel = output_channel // block_params.get("multiplier", 1)
@@ -388,7 +323,6 @@ class Decoder(nn.Module):
                    stride=(2, 2, 2),
                    residual=block_params.get("residual", False),
                    out_channels_reduction_factor=block_params.get("multiplier", 1),
                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown layer: {block_name}")
@@ -406,13 +340,7 @@ class Decoder(nn.Module):
        self.conv_act = nn.SiLU()
        self.conv_out = make_conv_nd(
-            dims,
+            dims, output_channel, out_channels, 3, padding=1, causal=True
            output_channel,
            out_channels,
            3,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.gradient_checkpointing = False
@@ -505,12 +433,6 @@ class UNetMidBlock3D(nn.Module):
        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks.
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
        inject_noise (`bool`, *optional*, defaults to `False`):
            Whether to inject noise into the hidden states.
        timestep_conditioning (`bool`, *optional*, defaults to `False`):
            Whether to condition the hidden states on the timestep.
    Returns:
        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -529,7 +451,6 @@ class UNetMidBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        resnet_groups = (
@@ -555,17 +476,13 @@ class UNetMidBlock3D(nn.Module):
                    norm_layer=norm_layer,
                    inject_noise=inject_noise,
                    timestep_conditioning=timestep_conditioning,
                    spatial_padding_mode=spatial_padding_mode,
                )
                for _ in range(num_layers)
            ]
        )
    def forward(
-        self,
+        self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
        hidden_states: torch.FloatTensor,
        causal: bool = True,
        timestep: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        timestep_embed = None
        if self.timestep_conditioning:
@@ -590,62 +507,9 @@ class UNetMidBlock3D(nn.Module):
        return hidden_states
 class SpaceToDepthDownsample(nn.Module):
    def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
        super().__init__()
        self.stride = stride
        self.group_size = in_channels * math.prod(stride) // out_channels
        self.conv = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
            out_channels=out_channels // math.prod(stride),
            kernel_size=3,
            stride=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
    def forward(self, x, causal: bool = True):
        if self.stride[0] == 2:
            x = torch.cat(
                [x[:, :, :1, :, :], x], dim=2
            )  # duplicate first frames for padding
        # skip connection
        x_in = rearrange(
            x,
            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
            p1=self.stride[0],
            p2=self.stride[1],
            p3=self.stride[2],
        )
        x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
        x_in = x_in.mean(dim=2)
        # conv
        x = self.conv(x, causal=causal)
        x = rearrange(
            x,
            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
            p1=self.stride[0],
            p2=self.stride[1],
            p3=self.stride[2],
        )
        x = x + x_in
        return x
 class DepthToSpaceUpsample(nn.Module):
    def __init__(
-        self,
+        self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
        dims,
        in_channels,
        stride,
        residual=False,
        out_channels_reduction_factor=1,
        spatial_padding_mode="zeros",
    ):
        super().__init__()
        self.stride = stride
@@ -659,7 +523,6 @@ class DepthToSpaceUpsample(nn.Module):
            kernel_size=3,
            stride=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor
@@ -695,7 +558,7 @@ class DepthToSpaceUpsample(nn.Module):
 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
-        self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
    def forward(self, x):
        x = rearrange(x, "b c d h w -> b d h w c")
@@ -728,7 +591,6 @@ class ResnetBlock3D(nn.Module):
        norm_layer: str = "group_norm",
        inject_noise: bool = False,
        timestep_conditioning: bool = False,
        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.in_channels = in_channels
@@ -755,7 +617,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        if inject_noise:
@@ -780,7 +641,6 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
            spatial_padding_mode=spatial_padding_mode,
        )
        if inject_noise:
@@ -941,44 +801,9 @@ class processor(nn.Module):
        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)
 class VideoVAE(nn.Module):
-    def __init__(self, version=0, config=None):
+    def __init__(self, version=0):
        super().__init__()
        if config is None:
            config = self.guess_config(version)
        self.timestep_conditioning = config.get("timestep_conditioning", False)
        double_z = config.get("double_z", True)
        latent_log_var = config.get(
            "latent_log_var", "per_channel" if double_z else "none"
        )
        self.encoder = Encoder(
            dims=config["dims"],
            in_channels=config.get("in_channels", 3),
            out_channels=config["latent_channels"],
            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
            patch_size=config.get("patch_size", 1),
            latent_log_var=latent_log_var,
            norm_layer=config.get("norm_layer", "group_norm"),
            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
        )
        self.decoder = Decoder(
            dims=config["dims"],
            in_channels=config["latent_channels"],
            out_channels=config.get("out_channels", 3),
            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
            patch_size=config.get("patch_size", 1),
            norm_layer=config.get("norm_layer", "group_norm"),
            causal=config.get("causal_decoder", False),
            timestep_conditioning=self.timestep_conditioning,
            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
        )
        self.per_channel_statistics = processor()
    def guess_config(self, version):
        if version == 0:
            config = {
                "_class_name": "CausalVideoAutoencoder",
@@ -1005,7 +830,7 @@ class VideoVAE(nn.Module):
                "use_quant_conv": False,
                "causal_decoder": False,
            }
-        elif version == 1:
+        else:
            config = {
                "_class_name": "CausalVideoAutoencoder",
                "dims": 3,
@@ -1041,47 +866,37 @@ class VideoVAE(nn.Module):
                "causal_decoder": False,
                "timestep_conditioning": True,
            }
-        else:
+
-            config = {
+        double_z = config.get("double_z", True)
-                "_class_name": "CausalVideoAutoencoder",
+        latent_log_var = config.get(
-                "dims": 3,
+            "latent_log_var", "per_channel" if double_z else "none"
-                "in_channels": 3,
+        )
-                "out_channels": 3,
+
-                "latent_channels": 128,
+        self.encoder = Encoder(
-                "encoder_blocks": [
+            dims=config["dims"],
-                    ["res_x", {"num_layers": 4}],
+            in_channels=config.get("in_channels", 3),
-                    ["compress_space_res", {"multiplier": 2}],
+            out_channels=config["latent_channels"],
-                    ["res_x", {"num_layers": 6}],
+            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
-                    ["compress_time_res", {"multiplier": 2}],
+            patch_size=config.get("patch_size", 1),
-                    ["res_x", {"num_layers": 6}],
+            latent_log_var=latent_log_var,
-                    ["compress_all_res", {"multiplier": 2}],
+            norm_layer=config.get("norm_layer", "group_norm"),
-                    ["res_x", {"num_layers": 2}],
+        )
-                    ["compress_all_res", {"multiplier": 2}],
+
-                    ["res_x", {"num_layers": 2}]
+        self.decoder = Decoder(
-                ],
+            dims=config["dims"],
-                "decoder_blocks": [
+            in_channels=config["latent_channels"],
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+            out_channels=config.get("out_channels", 3),
-                    ["compress_all", {"residual": True, "multiplier": 2}],
+            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+            patch_size=config.get("patch_size", 1),
-                    ["compress_all", {"residual": True, "multiplier": 2}],
+            norm_layer=config.get("norm_layer", "group_norm"),
-                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+            causal=config.get("causal_decoder", False),
-                    ["compress_all", {"residual": True, "multiplier": 2}],
+            timestep_conditioning=config.get("timestep_conditioning", False),
-                    ["res_x", {"num_layers": 5, "inject_noise": False}]
+        )
-                ],
+
-                "scaling_factor": 1.0,
+        self.timestep_conditioning = config.get("timestep_conditioning", False)
-                "norm_layer": "pixel_norm",
+        self.per_channel_statistics = processor()
                "patch_size": 4,
                "latent_log_var": "uniform",
                "use_quant_conv": False,
                "causal_decoder": False,
                "timestep_conditioning": True
            }
        return config
    def encode(self, x):
        frames_count = x.shape[2]
        if ((frames_count - 1) % 8) != 0:
            raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
        return self.per_channel_statistics.normalize(means)
--- a/comfy/ldm/lightricks/vae/conv_nd_factory.py
+++ b/comfy/ldm/lightricks/vae/conv_nd_factory.py
@@ -17,11 +17,7 @@ def make_conv_nd(
    groups=1,
    bias=True,
    causal=False,
    spatial_padding_mode="zeros",
    temporal_padding_mode="zeros",
 ):
    if not (spatial_padding_mode == temporal_padding_mode or causal):
        raise NotImplementedError("spatial and temporal padding modes must be equal")
    if dims == 2:
        return ops.Conv2d(
            in_channels=in_channels,
@@ -32,7 +28,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    elif dims == 3:
        if causal:
@@ -45,7 +40,6 @@ def make_conv_nd(
                dilation=dilation,
                groups=groups,
                bias=bias,
                spatial_padding_mode=spatial_padding_mode,
            )
        return ops.Conv3d(
            in_channels=in_channels,
@@ -56,7 +50,6 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    elif dims == (2, 1):
        return DualConv3d(
@@ -66,7 +59,6 @@ def make_conv_nd(
            stride=stride,
            padding=padding,
            bias=bias,
            padding_mode=spatial_padding_mode,
        )
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
--- a/comfy/ldm/lightricks/vae/dual_conv3d.py
+++ b/comfy/ldm/lightricks/vae/dual_conv3d.py
@@ -18,13 +18,11 @@ class DualConv3d(nn.Module):
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups=1,
        bias=True,
        padding_mode="zeros",
    ):
        super(DualConv3d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.padding_mode = padding_mode
        # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size, kernel_size)
@@ -110,7 +108,6 @@ class DualConv3d(nn.Module):
            self.padding1,
            self.dilation1,
            self.groups,
            padding_mode=self.padding_mode,
        )
        if skip_time_conv:
@@ -125,7 +122,6 @@ class DualConv3d(nn.Module):
            self.padding2,
            self.dilation2,
            self.groups,
            padding_mode=self.padding_mode,
        )
        return x
@@ -141,16 +137,7 @@ class DualConv3d(nn.Module):
        stride1 = (self.stride1[1], self.stride1[2])
        padding1 = (self.padding1[1], self.padding1[2])
        dilation1 = (self.dilation1[1], self.dilation1[2])
-        x = F.conv2d(
+        x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
            x,
            weight1,
            self.bias1,
            stride1,
            padding1,
            dilation1,
            self.groups,
            padding_mode=self.padding_mode,
        )
        _, _, h, w = x.shape
@@ -167,16 +154,7 @@ class DualConv3d(nn.Module):
        stride2 = self.stride2[0]
        padding2 = self.padding2[0]
        dilation2 = self.dilation2[0]
-        x = F.conv1d(
+        x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
            x,
            weight2,
            self.bias2,
            stride2,
            padding2,
            dilation2,
            self.groups,
            padding_mode=self.padding_mode,
        )
        x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
        return x
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -108,7 +108,7 @@ class BaseModel(torch.nn.Module):
        if not unet_config.get("disable_unet_model_creation", False):
            if model_config.custom_operations is None:
-                fp8 = model_config.optimizations.get("fp8", False)
+                fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
            else:
                operations = model_config.custom_operations
@@ -161,13 +161,9 @@ class BaseModel(torch.nn.Module):
                    extra = extra.to(dtype)
            extra_conds[o] = extra
        t = self.process_timestep(t, x=x, **extra_conds)
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)
    def process_timestep(self, timestep, **kwargs):
        return timestep
    def get_dtype(self):
        return self.diffusion_model.dtype
@@ -189,11 +185,6 @@ class BaseModel(torch.nn.Module):
            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                if noise.ndim == 5:
                    if concat_latent_image.shape[-3] < noise.shape[-3]:
                        concat_latent_image = torch.nn.functional.pad(concat_latent_image, (0, 0, 0, 0, 0, noise.shape[-3] - concat_latent_image.shape[-3]), "constant", 0)
                    else:
                        concat_latent_image = concat_latent_image[:, :, :noise.shape[-3]]
            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])
@@ -222,11 +213,6 @@ class BaseModel(torch.nn.Module):
                        cond_concat.append(self.blank_inpaint_image_like(noise))
                    elif ck == "mask_inverted":
                        cond_concat.append(torch.zeros_like(noise)[:, :1])
                if ck == "concat_image":
                    if concat_latent_image is not None:
                        cond_concat.append(concat_latent_image.to(device))
                    else:
                        cond_concat.append(torch.zeros_like(noise))
            data = torch.cat(cond_concat, dim=1)
            return data
        return None
@@ -859,26 +845,17 @@ class LTXV(BaseModel):
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        guiding_latent = kwargs.get("guiding_latent", None)
        if guiding_latent is not None:
            out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
        guiding_latent_noise_scale = kwargs.get("guiding_latent_noise_scale", None)
        if guiding_latent_noise_scale is not None:
            out["guiding_latent_noise_scale"] = comfy.conds.CONDConstant(guiding_latent_noise_scale)
        out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
        denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if denoise_mask is not None:
            out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
        keyframe_idxs = kwargs.get("keyframe_idxs", None)
        if keyframe_idxs is not None:
            out['keyframe_idxs'] = comfy.conds.CONDRegular(keyframe_idxs)
        return out
    def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
        if denoise_mask is None:
            return timestep
        return self.diffusion_model.patchifier.patchify(((denoise_mask) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1)))[:, :1])[0]
    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
        return latent_image
 class HunyuanVideo(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
@@ -895,35 +872,20 @@ class HunyuanVideo(BaseModel):
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        if image is not None:
            padding_shape = (noise.shape[0], 16, noise.shape[2] - 1, noise.shape[3], noise.shape[4])
            latent_padding = torch.zeros(padding_shape, device=noise.device, dtype=noise.dtype)
            image_latents = torch.cat([image.to(noise), latent_padding], dim=2)
            out['c_concat'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_latents))
        guidance = kwargs.get("guidance", 6.0)
        if guidance is not None:
            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        guiding_frame_index = kwargs.get("guiding_frame_index", None)
        if guiding_frame_index is not None:
            out['guiding_frame_index'] = comfy.conds.CONDRegular(torch.FloatTensor([guiding_frame_index]))
        return out
    def scale_latent_inpaint(self, latent_image, **kwargs):
        return latent_image
 class HunyuanVideoI2V(HunyuanVideo):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device)
        self.concat_keys = ("concat_image", "mask_inverted")
    def scale_latent_inpaint(self, latent_image, **kwargs):
        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
 class HunyuanVideoSkyreelsI2V(HunyuanVideo):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device)
        self.concat_keys = ("concat_image",)
    def scale_latent_inpaint(self, latent_image, **kwargs):
        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
 class CosmosVideo(BaseModel):
    def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT)
@@ -973,11 +935,11 @@ class WAN21(BaseModel):
        self.image_to_video = image_to_video
    def concat_cond(self, **kwargs):
-        noise = kwargs.get("noise", None)
+        if not self.image_to_video:
        if self.diffusion_model.patch_embedding.weight.shape[1] == noise.shape[1]:
            return None
        image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]
        if image is None:
@@ -987,9 +949,6 @@ class WAN21(BaseModel):
        image = self.process_latent_in(image)
        image = utils.resize_to_batch_size(image, noise.shape[0])
        if not self.image_to_video:
            return image
        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if mask is None:
            mask = torch.zeros_like(noise)[:, :4]
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -1,4 +1,3 @@
 import json
 import comfy.supported_models
 import comfy.supported_models_base
 import comfy.utils
@@ -34,7 +33,7 @@ def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
        return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack, time_stack_cross
    return None
-def detect_unet_config(state_dict, key_prefix, metadata=None):
+def detect_unet_config(state_dict, key_prefix):
    state_dict_keys = list(state_dict.keys())
    if '{}joint_blocks.0.context_block.attn.qkv.weight'.format(key_prefix) in state_dict_keys: #mmdit model
@@ -211,8 +210,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
    if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
        dit_config = {}
        dit_config["image_model"] = "ltxv"
        if metadata is not None and "config" in metadata:
            dit_config.update(json.loads(metadata["config"]).get("transformer", {}))
        return dit_config
    if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt
@@ -457,8 +454,8 @@ def model_config_from_unet_config(unet_config, state_dict=None):
    logging.error("no match {}".format(unet_config))
    return None
-def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False, metadata=None):
+def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False):
-    unet_config = detect_unet_config(state_dict, unet_key_prefix, metadata=metadata)
+    unet_config = detect_unet_config(state_dict, unet_key_prefix)
    if unet_config is None:
        return None
    model_config = model_config_from_unet_config(unet_config, state_dict)
@@ -471,10 +468,6 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
        model_config.scaled_fp8 = scaled_fp8_weight.dtype
        if model_config.scaled_fp8 == torch.float32:
            model_config.scaled_fp8 = torch.float8_e4m3fn
        if scaled_fp8_weight.nelement() == 2:
            model_config.optimizations["fp8"] = False
        else:
            model_config.optimizations["fp8"] = True
    return model_config
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -186,21 +186,12 @@ def get_total_memory(dev=None, torch_total_too=False):
    else:
        return mem_total
 def mac_version():
    try:
        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
    except:
        return None
 total_vram = get_total_memory(get_torch_device()) / (1024 * 1024)
 total_ram = psutil.virtual_memory().total / (1024 * 1024)
 logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))
 try:
    logging.info("pytorch version: {}".format(torch_version))
    mac_ver = mac_version()
    if mac_ver is not None:
        logging.info("Mac Version {}".format(mac_ver))
 except:
    pass
@@ -590,7 +581,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
            loaded_memory = loaded_model.model_loaded_memory()
            current_free_mem = get_free_memory(torch_dev) + loaded_memory
-            lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
+            lowvram_model_memory = max(64 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
            lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
        if vram_set_state == VRAMState.NO_VRAM:
@@ -978,6 +969,12 @@ def pytorch_attention_flash_attention():
            return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
    return False
 def mac_version():
    try:
        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
    except:
        return None
 def force_upcast_attention_dtype():
    upcast = args.force_upcast_attention
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1089,6 +1089,7 @@ class ModelPatcher:
    def patch_hooks(self, hooks: comfy.hooks.HookGroup):
        with self.use_ejected():
            self.unpatch_hooks()
            if hooks is not None:
                model_sd_keys = list(self.model_state_dict().keys())
                memory_counter = None
@@ -1099,16 +1100,12 @@ class ModelPatcher:
                # if have cached weights for hooks, use it
                cached_weights = self.cached_hook_patches.get(hooks, None)
                if cached_weights is not None:
                    model_sd_keys_set = set(model_sd_keys)
                    for key in cached_weights:
                        if key not in model_sd_keys:
                            logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}")
                            continue
                        self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter)
                        model_sd_keys_set.remove(key)
                    self.unpatch_hooks(model_sd_keys_set)
                else:
                    self.unpatch_hooks()
                    relevant_patches = self.get_combined_hook_patches(hooks=hooks)
                    original_weights = None
                    if len(relevant_patches) > 0:
@@ -1119,8 +1116,6 @@ class ModelPatcher:
                            continue
                        self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights,
                                                            memory_counter=memory_counter)
            else:
                self.unpatch_hooks()
            self.current_hooks = hooks
    def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
@@ -1177,23 +1172,17 @@ class ModelPatcher:
        del out_weight
        del weight
-    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
+    def unpatch_hooks(self) -> None:
        with self.use_ejected():
            if len(self.hook_backup) == 0:
                self.current_hooks = None
                return
            keys = list(self.hook_backup.keys())
-            if whitelist_keys_set:
+            for k in keys:
-                for k in keys:
+                comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
                    if k in whitelist_keys_set:
                        comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
                        self.hook_backup.pop(k)
            else:
                for k in keys:
                    comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
-                self.hook_backup.clear()
+            self.hook_backup.clear()
-                self.current_hooks = None
+            self.current_hooks = None
    def clean_hooks(self):
        self.unpatch_hooks()
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -17,7 +17,6 @@
 """
 import torch
 import logging
 import comfy.model_management
 from comfy.cli_args import args, PerformanceFeature
 import comfy.float
@@ -309,7 +308,6 @@ class fp8_ops(manual_cast):
            return torch.nn.functional.linear(input, weight, bias)
 def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
    logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
    class scaled_fp8_op(manual_cast):
        class Linear(manual_cast.Linear):
            def __init__(self, *args, **kwargs):
@@ -360,7 +358,7 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
    if scaled_fp8 is not None:
-        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
+        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute, scale_input=True, override_dtype=scaled_fp8)
    if (
        fp8_compute and
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -19,12 +19,6 @@ import comfy.hooks
 import scipy.stats
 import numpy
 def add_area_dims(area, num_dims):
    while (len(area) // 2) < num_dims:
        area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
    return area
 def get_area_and_mult(conds, x_in, timestep_in):
    dims = tuple(x_in.shape[2:])
    area = None
@@ -40,9 +34,8 @@ def get_area_and_mult(conds, x_in, timestep_in):
            return None
    if 'area' in conds:
        area = list(conds['area'])
-        area = add_area_dims(area, len(dims))
+        while (len(area) // 2) < len(dims):
-        if (len(area) // 2) > len(dims):
+            area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
            area = area[:len(dims)] + area[len(area) // 2:(len(area) // 2) + len(dims)]
    if 'strength' in conds:
        strength = conds['strength']
@@ -60,7 +53,7 @@ def get_area_and_mult(conds, x_in, timestep_in):
        if "mask_strength" in conds:
            mask_strength = conds["mask_strength"]
        mask = conds['mask']
-        assert (mask.shape[1:] == x_in.shape[2:])
+        assert(mask.shape[1:] == x_in.shape[2:])
        mask = mask[:input_x.shape[0]]
        if area is not None:
@@ -74,17 +67,16 @@ def get_area_and_mult(conds, x_in, timestep_in):
    mult = mask * strength
    if 'mask' not in conds and area is not None:
-        fuzz = 8
+        rr = 8
        for i in range(len(dims)):
            rr = min(fuzz, mult.shape[2 + i] // 4)
            if area[len(dims) + i] != 0:
                for t in range(rr):
                    m = mult.narrow(i + 2, t, 1)
-                    m *= ((1.0 / rr) * (t + 1))
+                    m *= ((1.0/rr) * (t + 1))
            if (area[i] + area[len(dims) + i]) < x_in.shape[i + 2]:
                for t in range(rr):
                    m = mult.narrow(i + 2, area[i] - 1 - t, 1)
-                    m *= ((1.0 / rr) * (t + 1))
+                    m *= ((1.0/rr) * (t + 1))
    conditioning = {}
    model_conds = conds["model_conds"]
@@ -559,37 +551,25 @@ def resolve_areas_and_cond_masks(conditions, h, w, device):
    logging.warning("WARNING: The comfy.samplers.resolve_areas_and_cond_masks function is deprecated please use the resolve_areas_and_cond_masks_multidim one instead.")
    return resolve_areas_and_cond_masks_multidim(conditions, [h, w], device)
-def create_cond_with_same_area_if_none(conds, c):
+def create_cond_with_same_area_if_none(conds, c): #TODO: handle dim != 2
    if 'area' not in c:
        return
    def area_inside(a, area_cmp):
        a = add_area_dims(a, len(area_cmp) // 2)
        area_cmp = add_area_dims(area_cmp, len(a) // 2)
        a_l = len(a) // 2
        area_cmp_l = len(area_cmp) // 2
        for i in range(min(a_l, area_cmp_l)):
            if a[a_l + i] < area_cmp[area_cmp_l + i]:
                return False
        for i in range(min(a_l, area_cmp_l)):
            if (a[i] + a[a_l + i]) > (area_cmp[i] + area_cmp[area_cmp_l + i]):
                return False
        return True
    c_area = c['area']
    smallest = None
    for x in conds:
        if 'area' in x:
            a = x['area']
-            if area_inside(c_area, a):
+            if c_area[2] >= a[2] and c_area[3] >= a[3]:
-                if smallest is None:
+                if a[0] + a[2] >= c_area[0] + c_area[2]:
-                    smallest = x
+                    if a[1] + a[3] >= c_area[1] + c_area[3]:
-                elif 'area' not in smallest:
+                        if smallest is None:
-                    smallest = x
+                            smallest = x
-                else:
+                        elif 'area' not in smallest:
-                    if math.prod(smallest['area'][:len(smallest['area']) // 2]) > math.prod(a[:len(a) // 2]):
+                            smallest = x
-                        smallest = x
+                        else:
                            if smallest['area'][0] * smallest['area'][1] > a[0] * a[1]:
                                smallest = x
        else:
            if smallest is None:
                smallest = x
@@ -710,7 +690,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c
                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
                  "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
-                  "gradient_estimation", "er_sde"]
+                  "gradient_estimation"]
 class KSAMPLER(Sampler):
    def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
 import json
 import torch
 from enum import Enum
 import logging
@@ -135,8 +134,8 @@ class CLIP:
    def clip_layer(self, layer_idx):
        self.layer_idx = layer_idx
-    def tokenize(self, text, return_word_ids=False, **kwargs):
+    def tokenize(self, text, return_word_ids=False):
-        return self.tokenizer.tokenize_with_weights(text, return_word_ids, **kwargs)
+        return self.tokenizer.tokenize_with_weights(text, return_word_ids)
    def add_hooks_to_dict(self, pooled_dict: dict[str]):
        if self.apply_hooks_to_conds:
@@ -250,7 +249,7 @@ class CLIP:
        return self.patcher.get_key_patches()
 class VAE:
-    def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
+    def __init__(self, sd=None, device=None, config=None, dtype=None):
        if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
            sd = diffusers_convert.convert_vae_state_dict(sd)
@@ -358,12 +357,7 @@ class VAE:
                    version = 0
                elif tensor_conv1.shape[0] == 1024:
                    version = 1
-                    if "encoder.down_blocks.1.conv.conv.bias" in sd:
+                self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version)
                        version = 2
                vae_config = None
                if metadata is not None and "config" in metadata:
                    vae_config = json.loads(metadata["config"]).get("vae", None)
                self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version, config=vae_config)
                self.latent_channels = 128
                self.latent_dim = 3
                self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
@@ -879,13 +873,13 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
    return (model, clip, vae)
 def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
-    sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
+    sd = comfy.utils.load_torch_file(ckpt_path)
-    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
+    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
    if out is None:
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
    return out
-def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
+def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
    clip = None
    clipvision = None
    vae = None
@@ -897,7 +891,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()
-    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
+    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
    if model_config is None:
        return None
@@ -926,7 +920,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    if output_vae:
        vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
        vae_sd = model_config.process_vae_state_dict(vae_sd)
-        vae = VAE(sd=vae_sd, metadata=metadata)
+        vae = VAE(sd=vae_sd)
    if output_clip:
        clip_target = model_config.clip_target(state_dict=sd)
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -158,93 +158,71 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        self.layer_idx = self.options_default[1]
        self.return_projected_pooled = self.options_default[2]
-    def process_tokens(self, tokens, device):
+    def set_up_textual_embeddings(self, tokens, current_embeds):
-        end_token = self.special_tokens.get("end", None)
+        out_tokens = []
-        if end_token is None:
+        next_new_token = token_dict_size = current_embeds.weight.shape[0]
-            cmp_token = self.special_tokens.get("pad", -1)
+        embedding_weights = []
        else:
            cmp_token = end_token
        embeds_out = []
        attention_masks = []
        num_tokens = []
        for x in tokens:
            attention_mask = []
            tokens_temp = []
            other_embeds = []
            eos = False
            index = 0
            for y in x:
                if isinstance(y, numbers.Integral):
-                    if eos:
+                    tokens_temp += [int(y)]
-                        attention_mask.append(0)
+                else:
                    if y.shape[0] == current_embeds.weight.shape[1]:
                        embedding_weights += [y]
                        tokens_temp += [next_new_token]
                        next_new_token += 1
                    else:
-                        attention_mask.append(1)
+                        logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(y.shape[0], current_embeds.weight.shape[1]))
-                    token = int(y)
+            while len(tokens_temp) < len(x):
-                    tokens_temp += [token]
+                tokens_temp += [self.special_tokens["pad"]]
-                    if not eos and token == cmp_token:
+            out_tokens += [tokens_temp]
                        if end_token is None:
                            attention_mask[-1] = 0
                        eos = True
                else:
                    other_embeds.append((index, y))
                index += 1
-            tokens_embed = torch.tensor([tokens_temp], device=device, dtype=torch.long)
+        n = token_dict_size
-            tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32)
+        if len(embedding_weights) > 0:
-            index = 0
+            new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
-            pad_extra = 0
+            new_embedding.weight[:token_dict_size] = current_embeds.weight
-            for o in other_embeds:
+            for x in embedding_weights:
-                emb = o[1]
+                new_embedding.weight[n] = x
-                if torch.is_tensor(emb):
+                n += 1
-                    emb = {"type": "embedding", "data": emb}
+            self.transformer.set_input_embeddings(new_embedding)
-                emb_type = emb.get("type", None)
+        processed_tokens = []
-                if emb_type == "embedding":
+        for x in out_tokens:
-                    emb = emb.get("data", None)
+            processed_tokens += [list(map(lambda a: n if a == -1 else a, x))] #The EOS token should always be the largest one
                else:
                    if hasattr(self.transformer, "preprocess_embed"):
                        emb = self.transformer.preprocess_embed(emb, device=device)
                    else:
                        emb = None
-                if emb is None:
+        return processed_tokens
                    index += -1
                    continue
                ind = index + o[0]
                emb = emb.view(1, -1, emb.shape[-1]).to(device=device, dtype=torch.float32)
                emb_shape = emb.shape[1]
                if emb.shape[-1] == tokens_embed.shape[-1]:
                    tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1)
                    attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:]
                    index += emb_shape - 1
                else:
                    index += -1
                    pad_extra += emb_shape
                    logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(emb.shape[-1], tokens_embed.shape[-1]))
            if pad_extra > 0:
                padd_embed = self.transformer.get_input_embeddings()(torch.tensor([[self.special_tokens["pad"]] * pad_extra], device=device, dtype=torch.long), out_dtype=torch.float32)
                tokens_embed = torch.cat([tokens_embed, padd_embed], dim=1)
                attention_mask = attention_mask + [0] * pad_extra
            embeds_out.append(tokens_embed)
            attention_masks.append(attention_mask)
            num_tokens.append(sum(attention_mask))
        return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens
    def forward(self, tokens):
-        device = self.transformer.get_input_embeddings().weight.device
+        backup_embeds = self.transformer.get_input_embeddings()
-        embeds, attention_mask, num_tokens = self.process_tokens(tokens, device)
+        device = backup_embeds.weight.device
        tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
        tokens = torch.LongTensor(tokens).to(device)
        attention_mask = None
        if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks:
            attention_mask = torch.zeros_like(tokens)
            end_token = self.special_tokens.get("end", None)
            if end_token is None:
                cmp_token = self.special_tokens.get("pad", -1)
            else:
                cmp_token = end_token
            for x in range(attention_mask.shape[0]):
                for y in range(attention_mask.shape[1]):
                    attention_mask[x, y] = 1
                    if tokens[x, y] == cmp_token:
                        if end_token is None:
                            attention_mask[x, y] = 0
                        break
        attention_mask_model = None
        if self.enable_attention_masks:
            attention_mask_model = attention_mask
-        outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
+        outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
        self.transformer.set_input_embeddings(backup_embeds)
        if self.layer == "last":
            z = outputs[0].float()
@@ -504,7 +482,7 @@ class SDTokenizer:
        return (embed, leftover)
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        '''
        Takes a prompt and converts it to a list of (token, weight, word id) elements.
        Tokens can both be integer tokens and pre computed CLIP tensors.
@@ -618,7 +596,7 @@ class SD1Tokenizer:
        tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
        setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids)
        return out
--- a/comfy/sdxl_clip.py
+++ b/comfy/sdxl_clip.py
@@ -26,7 +26,7 @@ class SDXLTokenizer:
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -762,7 +762,7 @@ class LTXV(supported_models_base.BASE):
    unet_extra_config = {}
    latent_format = latent_formats.LTXV
-    memory_usage_factor = 5.5 # TODO: img2vid is about 2x vs txt2vid
+    memory_usage_factor = 2.7
    supported_inference_dtypes = [torch.bfloat16, torch.float32]
@@ -826,26 +826,6 @@ class HunyuanVideo(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}llama.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**hunyuan_detect))
 class HunyuanVideoI2V(HunyuanVideo):
    unet_config = {
        "image_model": "hunyuan_video",
        "in_channels": 33,
    }
    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.HunyuanVideoI2V(self, device=device)
        return out
 class HunyuanVideoSkyreelsI2V(HunyuanVideo):
    unet_config = {
        "image_model": "hunyuan_video",
        "in_channels": 32,
    }
    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.HunyuanVideoSkyreelsI2V(self, device=device)
        return out
 class CosmosT2V(supported_models_base.BASE):
    unet_config = {
        "image_model": "cosmos",
@@ -931,7 +911,7 @@ class WAN21_T2V(supported_models_base.BASE):
    memory_usage_factor = 1.0
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]
@@ -959,6 +939,6 @@ class WAN21_I2V(WAN21_T2V):
        out = model_base.WAN21(self, image_to_video=True, device=device)
        return out
-models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V]
+models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V]
 models += [SVD_img2vid]
--- a/comfy/text_encoders/bert.py
+++ b/comfy/text_encoders/bert.py
@@ -93,11 +93,8 @@ class BertEmbeddings(torch.nn.Module):
        self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
-    def forward(self, input_tokens, embeds=None, token_type_ids=None, dtype=None):
+    def forward(self, input_tokens, token_type_ids=None, dtype=None):
-        if embeds is not None:
+        x = self.word_embeddings(input_tokens, out_dtype=dtype)
            x = embeds
        else:
            x = self.word_embeddings(input_tokens, out_dtype=dtype)
        x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
        if token_type_ids is not None:
            x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
@@ -116,8 +113,8 @@ class BertModel_(torch.nn.Module):
        self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
        self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
-    def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
-        x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype)
+        x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
--- a/comfy/text_encoders/flux.py
+++ b/comfy/text_encoders/flux.py
@@ -18,7 +18,7 @@ class FluxTokenizer:
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
        out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@@ -4,7 +4,6 @@ import comfy.text_encoders.llama
 from transformers import LlamaTokenizerFast
 import torch
 import os
 import numbers
 def llama_detect(state_dict, prefix=""):
@@ -23,7 +22,7 @@ def llama_detect(state_dict, prefix=""):
 class LLAMA3Tokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}, min_length=256):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "llama_tokenizer")
-        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, min_length=min_length)
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, end_token=128009, min_length=min_length)
 class LLAMAModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}):
@@ -39,26 +38,15 @@ class HunyuanVideoTokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
-        self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"""  # 95 tokens
+        self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"""  # 95 tokens
        self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, image_interleave=1, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
-        if llama_template is None:
+        llama_text = "{}{}".format(self.llama_template, text)
-            llama_text = self.llama_template.format(text)
+        out["llama"] = self.llama.tokenize_with_weights(llama_text, return_word_ids)
        else:
            llama_text = llama_template.format(text)
        llama_text_tokens = self.llama.tokenize_with_weights(llama_text, return_word_ids)
        embed_count = 0
        for r in llama_text_tokens:
            for i in range(len(r)):
                if r[i][0] == 128257:
                    if image_embeds is not None and embed_count < image_embeds.shape[0]:
                        r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image", "image_interleave": image_interleave},) + r[i][1:]
                        embed_count += 1
        out["llama"] = llama_text_tokens
        return out
    def untokenize(self, token_weight_pair):
@@ -92,51 +80,20 @@ class HunyuanVideoClipModel(torch.nn.Module):
        llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
        template_end = 0
-        extra_template_end = 0
+        for i, v in enumerate(token_weight_pairs_llama[0]):
-        extra_sizes = 0
+            if v[0] == 128007:  # <|end_header_id|>
-        user_end = 9999999999999
+                template_end = i
        images = []
        tok_pairs = token_weight_pairs_llama[0]
        for i, v in enumerate(tok_pairs):
            elem = v[0]
            if not torch.is_tensor(elem):
                if isinstance(elem, numbers.Integral):
                    if elem == 128006:
                        if tok_pairs[i + 1][0] == 882:
                            if tok_pairs[i + 2][0] == 128007:
                                template_end = i + 2
                                user_end = -1
                    if elem == 128009 and user_end == -1:
                        user_end = i + 1
                else:
                    if elem.get("original_type") == "image":
                        elem_size = elem.get("data").shape[0]
                        if template_end > 0:
                            if user_end == -1:
                                extra_template_end += elem_size - 1
                        else:
                            image_start = i + extra_sizes
                            image_end = i + elem_size + extra_sizes
                            images.append((image_start, image_end, elem.get("image_interleave", 1)))
                            extra_sizes += elem_size - 1
        if llama_out.shape[1] > (template_end + 2):
-            if tok_pairs[template_end + 1][0] == 271:
+            if token_weight_pairs_llama[0][template_end + 1][0] == 271:
                template_end += 2
-        llama_output = llama_out[:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end]
+        llama_out = llama_out[:, template_end:]
-        llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end + extra_sizes:user_end + extra_sizes + extra_template_end]
+        llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end:]
        if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
            llama_extra_out.pop("attention_mask")  # attention mask is useless if no masked elements
        if len(images) > 0:
            out = []
            for i in images:
                out.append(llama_out[:, i[0]: i[1]: i[2]])
            llama_output = torch.cat(out + [llama_output], dim=1)
        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
-        return llama_output, l_pooled, llama_extra_out
+        return llama_out, l_pooled, llama_extra_out
    def load_sd(self, sd):
        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
--- a/comfy/text_encoders/hydit.py
+++ b/comfy/text_encoders/hydit.py
@@ -37,7 +37,7 @@ class HyditTokenizer:
        self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
        self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
        out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -241,11 +241,8 @@ class Llama2_(nn.Module):
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
        # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
-    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
-        if embeds is not None:
+        x = self.embed_tokens(x, out_dtype=dtype)
            x = embeds
        else:
            x = self.embed_tokens(x, out_dtype=dtype)
        if self.normalize_in:
            x *= self.config.hidden_size ** 0.5
--- a/comfy/text_encoders/sd3_clip.py
+++ b/comfy/text_encoders/sd3_clip.py
@@ -43,7 +43,7 @@ class SD3Tokenizer:
        self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory)
        self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
-    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
--- a/comfy/text_encoders/t5.py
+++ b/comfy/text_encoders/t5.py
@@ -239,11 +239,8 @@ class T5(torch.nn.Module):
    def set_input_embeddings(self, embeddings):
        self.shared = embeddings
-    def forward(self, input_ids, attention_mask, embeds=None, num_tokens=None, **kwargs):
+    def forward(self, input_ids, *args, **kwargs):
-        if input_ids is None:
+        x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
            x = embeds
        else:
            x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
        if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
            x = torch.nan_to_num(x) #Fix for fp8 T5 base
-        return self.encoder(x, attention_mask=attention_mask, **kwargs)
+        return self.encoder(x, *args, **kwargs)
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -46,18 +46,12 @@ if hasattr(torch.serialization, "add_safe_globals"):  # TODO: this was added in
 else:
    logging.info("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended.")
-def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
+def load_torch_file(ckpt, safe_load=False, device=None):
    if device is None:
        device = torch.device("cpu")
    metadata = None
    if ckpt.lower().endswith(".safetensors") or ckpt.lower().endswith(".sft"):
        try:
-            with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
+            sd = safetensors.torch.load_file(ckpt, device=device.type)
                sd = {}
                for k in f.keys():
                    sd[k] = f.get_tensor(k)
                if return_metadata:
                    metadata = f.metadata()
        except Exception as e:
            if len(e.args) > 0:
                message = e.args[0]
@@ -83,7 +77,7 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
                    sd = pl_sd
            else:
                sd = pl_sd
-    return (sd, metadata) if return_metadata else sd
+    return sd
 def save_torch_file(sd, ckpt, metadata=None):
    if metadata is not None:
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -1,5 +1,3 @@
 from __future__ import annotations
 import torchaudio
 import torch
 import comfy.model_management
@@ -12,7 +10,6 @@ import random
 import hashlib
 import node_helpers
 from comfy.cli_args import args
 from comfy.comfy_types import FileLocator
 class EmptyLatentAudio:
    def __init__(self):
@@ -167,7 +164,7 @@ class SaveAudio:
    def save_audio(self, audio, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
        filename_prefix += self.prefix_append
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
-        results: list[FileLocator] = []
+        results = list()
        metadata = {}
        if not args.disable_metadata:
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -454,7 +454,7 @@ class SamplerCustom:
        return {"required":
                    {"model": ("MODEL",),
                    "add_noise": ("BOOLEAN", {"default": True}),
-                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}),
+                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                    "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
                    "positive": ("CONDITIONING", ),
                    "negative": ("CONDITIONING", ),
@@ -605,16 +605,10 @@ class DisableNoise:
 class RandomNoise(DisableNoise):
    @classmethod
    def INPUT_TYPES(s):
-        return {
+        return {"required":{
-            "required": {
+                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
-                "noise_seed": ("INT", {
+                     }
-                    "default": 0,
+                }
                    "min": 0,
                    "max": 0xffffffffffffffff,
                    "control_after_generate": True,
                }),
            }
        }
    def get_noise(self, noise_seed):
        return (Noise_RandomNoise(noise_seed),)
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -1,5 +1,4 @@
 import nodes
 import node_helpers
 import torch
 import comfy.model_management
@@ -39,83 +38,7 @@ class EmptyHunyuanLatentVideo:
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        return ({"samples":latent}, )
 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
 )
 class TextEncodeHunyuanVideo_ImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "clip": ("CLIP", ),
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
            }}
    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"
    CATEGORY = "advanced/conditioning"
    def encode(self, clip, clip_vision_output, prompt, image_interleave):
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
        return (clip.encode_from_tokens_scheduled(tokens), )
 class HunyuanImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             "guidance_type": (["v1 (concat)", "v2 (replace)"], )
                },
                "optional": {"start_image": ("IMAGE", ),
                }}
    RETURN_TYPES = ("CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "latent")
    FUNCTION = "encode"
    CATEGORY = "conditioning/video_models"
    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}
        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
            concat_latent_image = vae.encode(start_image)
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0
            if guidance_type == "v1 (concat)":
                cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
            else:
                cond = {'guiding_frame_index': 0}
                latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
                out_latent["noise_mask"] = mask
            positive = node_helpers.conditioning_set_values(positive, cond)
        out_latent["samples"] = latent
        return (positive, out_latent)
 NODE_CLASS_MAPPINGS = {
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
 }
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@@ -1,5 +1,3 @@
 from __future__ import annotations
 import nodes
 import folder_paths
 from comfy.cli_args import args
@@ -11,8 +9,6 @@ import numpy as np
 import json
 import os
 from comfy.comfy_types import FileLocator
 MAX_RESOLUTION = nodes.MAX_RESOLUTION
 class ImageCrop:
@@ -103,7 +99,7 @@ class SaveAnimatedWEBP:
        method = self.methods.get(method)
        filename_prefix += self.prefix_append
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0])
-        results: list[FileLocator] = []
+        results = list()
        pil_images = []
        for image in images:
            i = 255. * image.cpu().numpy()
--- a/comfy_extras/nodes_load_3d.py
+++ b/comfy_extras/nodes_load_3d.py
@@ -19,6 +19,8 @@ class Load3D():
            "image": ("LOAD_3D", {}),
            "width": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
            "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
            "material": (["original", "normal", "wireframe", "depth"],),
            "up_direction": (["original", "-x", "+x", "-y", "+y", "-z", "+z"],),
        }}
    RETURN_TYPES = ("IMAGE", "MASK", "STRING")
@@ -53,6 +55,8 @@ class Load3DAnimation():
            "image": ("LOAD_3D_ANIMATION", {}),
            "width": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
            "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
            "material": (["original", "normal", "wireframe", "depth"],),
            "up_direction": (["original", "-x", "+x", "-y", "+y", "-z", "+z"],),
        }}
    RETURN_TYPES = ("IMAGE", "MASK", "STRING")
@@ -78,6 +82,8 @@ class Preview3D():
    def INPUT_TYPES(s):
        return {"required": {
            "model_file": ("STRING", {"default": "", "multiline": False}),
            "material": (["original", "normal", "wireframe", "depth"],),
            "up_direction": (["original", "-x", "+x", "-y", "+y", "-z", "+z"],),
        }}
    OUTPUT_NODE = True
@@ -96,6 +102,8 @@ class Preview3DAnimation():
    def INPUT_TYPES(s):
        return {"required": {
            "model_file": ("STRING", {"default": "", "multiline": False}),
            "material": (["original", "normal", "wireframe", "depth"],),
            "up_direction": (["original", "-x", "+x", "-y", "+y", "-z", "+z"],),
        }}
    OUTPUT_NODE = True
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -1,14 +1,9 @@
 import io
 import nodes
 import node_helpers
 import torch
 import comfy.model_management
 import comfy.model_sampling
 import comfy.utils
 import math
 import numpy as np
 import av
 from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
 class EmptyLTXVLatentVideo:
    @classmethod
@@ -38,6 +33,7 @@ class LTXVImgToVideo:
                             "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
                             "length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             "image_noise_scale": ("FLOAT", {"default": 0.15, "min": 0, "max": 1.0, "step": 0.01, "tooltip": "Amount of noise to apply on conditioning image latent."})
                             }}
    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
@@ -46,219 +42,16 @@ class LTXVImgToVideo:
    CATEGORY = "conditioning/video_models"
    FUNCTION = "generate"
-    def generate(self, positive, negative, image, vae, width, height, length, batch_size):
+    def generate(self, positive, negative, image, vae, width, height, length, batch_size, image_noise_scale):
        pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
        encode_pixels = pixels[:, :, :, :3]
        t = vae.encode(encode_pixels)
        positive = node_helpers.conditioning_set_values(positive, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
        negative = node_helpers.conditioning_set_values(negative, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
        latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
        latent[:, :, :t.shape[2]] = t
-
+        return (positive, negative, {"samples": latent}, )
        conditioning_latent_frames_mask = torch.ones(
            (batch_size, 1, latent.shape[2], 1, 1),
            dtype=torch.float32,
            device=latent.device,
        )
        conditioning_latent_frames_mask[:, :, :t.shape[2]] = 0
        return (positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}, )
 def conditioning_get_any_value(conditioning, key, default=None):
    for t in conditioning:
        if key in t[1]:
            return t[1][key]
    return default
 def get_noise_mask(latent):
    noise_mask = latent.get("noise_mask", None)
    latent_image = latent["samples"]
    if noise_mask is None:
        batch_size, _, latent_length, _, _ = latent_image.shape
        noise_mask = torch.ones(
            (batch_size, 1, latent_length, 1, 1),
            dtype=torch.float32,
            device=latent_image.device,
        )
    else:
        noise_mask = noise_mask.clone()
    return noise_mask
 def get_keyframe_idxs(cond):
    keyframe_idxs = conditioning_get_any_value(cond, "keyframe_idxs", None)
    if keyframe_idxs is None:
        return None, 0
    num_keyframes = torch.unique(keyframe_idxs[:, 0]).shape[0]
    return keyframe_idxs, num_keyframes
 class LTXVAddGuide:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "vae": ("VAE",),
                             "latent": ("LATENT",),
                             "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames."
                                                 "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}),
                             "frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999,
                                                   "tooltip": "Frame index to start the conditioning at. For single-frame images or "
                                                   "videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ "
                                                   "frames, frame_idx must be divisible by 8, otherwise it will be rounded down to "
                                                   "the nearest multiple of 8. Negative values are counted from the end of the video."}),
                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
                             }
            }
    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    CATEGORY = "conditioning/video_models"
    FUNCTION = "generate"
    def __init__(self):
        self._num_prefix_frames = 2
        self._patchifier = SymmetricPatchifier(1)
    def encode(self, vae, latent_width, latent_height, images, scale_factors):
        time_scale_factor, width_scale_factor, height_scale_factor = scale_factors
        images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1]
        pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1)
        encode_pixels = pixels[:, :, :, :3]
        t = vae.encode(encode_pixels)
        return encode_pixels, t
    def get_latent_index(self, cond, latent_length, guide_length, frame_idx, scale_factors):
        time_scale_factor, _, _ = scale_factors
        _, num_keyframes = get_keyframe_idxs(cond)
        latent_count = latent_length - num_keyframes
        frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0)
        if guide_length > 1:
            frame_idx = frame_idx // time_scale_factor * time_scale_factor # frame index must be divisible by 8
        latent_idx = (frame_idx + time_scale_factor - 1) // time_scale_factor
        return frame_idx, latent_idx
    def add_keyframe_index(self, cond, frame_idx, guiding_latent, scale_factors):
        keyframe_idxs, _ = get_keyframe_idxs(cond)
        _, latent_coords = self._patchifier.patchify(guiding_latent)
        pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, True)
        pixel_coords[:, 0] += frame_idx
        if keyframe_idxs is None:
            keyframe_idxs = pixel_coords
        else:
            keyframe_idxs = torch.cat([keyframe_idxs, pixel_coords], dim=2)
        return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs})
    def append_keyframe(self, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors):
        positive = self.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors)
        negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors)
        mask = torch.full(
            (noise_mask.shape[0], 1, guiding_latent.shape[2], 1, 1),
            1.0 - strength,
            dtype=noise_mask.dtype,
            device=noise_mask.device,
        )
        latent_image = torch.cat([latent_image, guiding_latent], dim=2)
        noise_mask = torch.cat([noise_mask, mask], dim=2)
        return positive, negative, latent_image, noise_mask
    def replace_latent_frames(self, latent_image, noise_mask, guiding_latent, latent_idx, strength):
        cond_length = guiding_latent.shape[2]
        assert latent_image.shape[2] >= latent_idx + cond_length, "Conditioning frames exceed the length of the latent sequence."
        mask = torch.full(
            (noise_mask.shape[0], 1, cond_length, 1, 1),
            1.0 - strength,
            dtype=noise_mask.dtype,
            device=noise_mask.device,
        )
        latent_image = latent_image.clone()
        noise_mask = noise_mask.clone()
        latent_image[:, :, latent_idx : latent_idx + cond_length] = guiding_latent
        noise_mask[:, :, latent_idx : latent_idx + cond_length] = mask
        return latent_image, noise_mask
    def generate(self, positive, negative, vae, latent, image, frame_idx, strength):
        scale_factors = vae.downscale_index_formula
        latent_image = latent["samples"]
        noise_mask = get_noise_mask(latent)
        _, _, latent_length, latent_height, latent_width = latent_image.shape
        image, t = self.encode(vae, latent_width, latent_height, image, scale_factors)
        frame_idx, latent_idx = self.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
        assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
        num_prefix_frames = min(self._num_prefix_frames, t.shape[2])
        positive, negative, latent_image, noise_mask = self.append_keyframe(
            positive,
            negative,
            frame_idx,
            latent_image,
            noise_mask,
            t[:, :, :num_prefix_frames],
            strength,
            scale_factors,
        )
        latent_idx += num_prefix_frames
        t = t[:, :, num_prefix_frames:]
        if t.shape[2] == 0:
            return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
        latent_image, noise_mask = self.replace_latent_frames(
            latent_image,
            noise_mask,
            t,
            latent_idx,
            strength,
        )
        return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
 class LTXVCropGuides:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "negative": ("CONDITIONING", ),
                             "latent": ("LATENT",),
                             }
            }
    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    CATEGORY = "conditioning/video_models"
    FUNCTION = "crop"
    def __init__(self):
        self._patchifier = SymmetricPatchifier(1)
    def crop(self, positive, negative, latent):
        latent_image = latent["samples"].clone()
        noise_mask = get_noise_mask(latent)
        _, num_keyframes = get_keyframe_idxs(positive)
        if num_keyframes == 0:
            return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
        latent_image = latent_image[:, :, :-num_keyframes]
        noise_mask = noise_mask[:, :, :-num_keyframes]
        positive = node_helpers.conditioning_set_values(positive, {"keyframe_idxs": None})
        negative = node_helpers.conditioning_set_values(negative, {"keyframe_idxs": None})
        return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
 class LTXVConditioning:
@@ -381,77 +174,6 @@ class LTXVScheduler:
        return (sigmas,)
 def encode_single_frame(output_file, image_array: np.ndarray, crf):
    container = av.open(output_file, "w", format="mp4")
    try:
        stream = container.add_stream(
            "h264", rate=1, options={"crf": str(crf), "preset": "veryfast"}
        )
        stream.height = image_array.shape[0]
        stream.width = image_array.shape[1]
        av_frame = av.VideoFrame.from_ndarray(image_array, format="rgb24").reformat(
            format="yuv420p"
        )
        container.mux(stream.encode(av_frame))
        container.mux(stream.encode())
    finally:
        container.close()
 def decode_single_frame(video_file):
    container = av.open(video_file)
    try:
        stream = next(s for s in container.streams if s.type == "video")
        frame = next(container.decode(stream))
    finally:
        container.close()
    return frame.to_ndarray(format="rgb24")
 def preprocess(image: torch.Tensor, crf=29):
    if crf == 0:
        return image
    image_array = (image[:(image.shape[0] // 2) * 2, :(image.shape[1] // 2) * 2] * 255.0).byte().cpu().numpy()
    with io.BytesIO() as output_file:
        encode_single_frame(output_file, image_array, crf)
        video_bytes = output_file.getvalue()
    with io.BytesIO(video_bytes) as video_file:
        image_array = decode_single_frame(video_file)
    tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0
    return tensor
 class LTXVPreprocess:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "image": ("IMAGE",),
                "img_compression": (
                    "INT",
                    {
                        "default": 35,
                        "min": 0,
                        "max": 100,
                        "tooltip": "Amount of compression to apply on image.",
                    },
                ),
            }
        }
    FUNCTION = "preprocess"
    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("output_image",)
    CATEGORY = "image"
    def preprocess(self, image, img_compression):
        if img_compression > 0:
            output_images = []
            for i in range(image.shape[0]):
                output_images.append(preprocess(image[i], img_compression))
        return (torch.stack(output_images),)
 NODE_CLASS_MAPPINGS = {
    "EmptyLTXVLatentVideo": EmptyLTXVLatentVideo,
@@ -459,7 +181,4 @@ NODE_CLASS_MAPPINGS = {
    "ModelSamplingLTXV": ModelSamplingLTXV,
    "LTXVConditioning": LTXVConditioning,
    "LTXVScheduler": LTXVScheduler,
    "LTXVAddGuide": LTXVAddGuide,
    "LTXVPreprocess": LTXVPreprocess,
    "LTXVCropGuides": LTXVCropGuides,
 }
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -1,12 +1,9 @@
 from __future__ import annotations
 import os
 import av
 import torch
 import folder_paths
 import json
 from fractions import Fraction
 from comfy.comfy_types import FileLocator
 class SaveWEBM:
@@ -65,7 +62,7 @@ class SaveWEBM:
        container.mux(stream.encode())
        container.close()
-        results: list[FileLocator] = [{
+        results = [{
            "filename": file,
            "subfolder": subfolder,
            "type": self.type
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -4,7 +4,6 @@ import comfy.utils
 import comfy.sd
 import folder_paths
 import comfy_extras.nodes_model_merging
 import node_helpers
 class ImageOnlyCheckpointLoader:
@@ -122,38 +121,12 @@ class ImageOnlyCheckpointSave(comfy_extras.nodes_model_merging.CheckpointSave):
        comfy_extras.nodes_model_merging.save_checkpoint(model, clip_vision=clip_vision, vae=vae, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
        return {}
 class ConditioningSetAreaPercentageVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"conditioning": ("CONDITIONING", ),
                             "width": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
                             "height": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
                             "temporal": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
                             "x": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
                             "y": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
                             "z": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
                             }}
    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "append"
    CATEGORY = "conditioning"
    def append(self, conditioning, width, height, temporal, x, y, z, strength):
        c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
                                                                "strength": strength,
                                                                "set_area_to_bounds": False})
        return (c, )
 NODE_CLASS_MAPPINGS = {
    "ImageOnlyCheckpointLoader": ImageOnlyCheckpointLoader,
    "SVD_img2vid_Conditioning": SVD_img2vid_Conditioning,
    "VideoLinearCFGGuidance": VideoLinearCFGGuidance,
    "VideoTriangleCFGGuidance": VideoTriangleCFGGuidance,
    "ImageOnlyCheckpointSave": ImageOnlyCheckpointSave,
    "ConditioningSetAreaPercentageVideo": ConditioningSetAreaPercentageVideo,
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.26"
+__version__ = "0.3.18"
--- a/execution.py
+++ b/execution.py
@@ -634,13 +634,6 @@ def validate_inputs(prompt, item, validated):
                continue
        else:
            try:
                # Unwraps values wrapped in __value__ key. This is used to pass
                # list widget value to execution, as by default list value is
                # reserved to represent the connection between nodes.
                if isinstance(val, dict) and "__value__" in val:
                    val = val["__value__"]
                    inputs[x] = val
                if type_input == "INT":
                    val = int(val)
                    inputs[x] = val
--- a/main.py
+++ b/main.py
@@ -139,7 +139,6 @@ from server import BinaryEventTypes
 import nodes
 import comfy.model_management
 import comfyui_version
 import app.logger
 def cuda_malloc_warning():
@@ -296,12 +295,9 @@ def start_comfyui(asyncio_loop=None):
 if __name__ == "__main__":
    # Running directly, just start ComfyUI.
    logging.info("ComfyUI version: {}".format(comfyui_version.__version__))
    event_loop, _, start_all_func = start_comfyui()
    try:
-        x = start_all_func()
+        event_loop.run_until_complete(start_all_func())
        app.logger.print_startup_warnings()
        event_loop.run_until_complete(x)
    except KeyboardInterrupt:
        logging.info("\nStopped server")
--- a/nodes.py
+++ b/nodes.py
@@ -25,7 +25,7 @@ import comfy.sample
 import comfy.sd
 import comfy.utils
 import comfy.controlnet
-from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator
+from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
 import comfy.clip_vision
@@ -479,7 +479,7 @@ class SaveLatent:
        file = f"{filename}_{counter:05}_.latent"
-        results: list[FileLocator] = []
+        results = list()
        results.append({
            "filename": file,
            "subfolder": subfolder,
@@ -489,7 +489,7 @@ class SaveLatent:
        file = os.path.join(full_output_folder, file)
        output = {}
-        output["latent_tensor"] = samples["samples"].contiguous()
+        output["latent_tensor"] = samples["samples"]
        output["latent_format_version_0"] = torch.tensor([])
        comfy.utils.save_torch_file(output, file, metadata=metadata)
@@ -1519,7 +1519,7 @@ class KSampler:
        return {
            "required": {
                "model": ("MODEL", {"tooltip": "The model used for denoising the input latent."}),
-                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True, "tooltip": "The random seed used for creating the noise."}),
+                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "tooltip": "The random seed used for creating the noise."}),
                "steps": ("INT", {"default": 20, "min": 1, "max": 10000, "tooltip": "The number of steps used in the denoising process."}),
                "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01, "tooltip": "The Classifier-Free Guidance scale balances creativity and adherence to the prompt. Higher values result in images more closely matching the prompt however too high values will negatively impact quality."}),
                "sampler_name": (comfy.samplers.KSampler.SAMPLERS, {"tooltip": "The algorithm used when sampling, this can affect the quality, speed, and style of the generated output."}),
@@ -1547,7 +1547,7 @@ class KSamplerAdvanced:
        return {"required":
                    {"model": ("MODEL",),
                    "add_noise": (["enable", "disable"], ),
-                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}),
+                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                    "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
                    "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
                    "sampler_name": (comfy.samplers.KSampler.SAMPLERS, ),
@@ -1785,7 +1785,14 @@ class LoadImageOutput(LoadImage):
    DESCRIPTION = "Load an image from the output folder. When the refresh button is clicked, the node will update the image list and automatically select the first image, allowing for easy iteration."
    EXPERIMENTAL = True
-    FUNCTION = "load_image"
+    FUNCTION = "load_image_output"
    def load_image_output(self, image):
        return self.load_image(f"{image} [output]")
    @classmethod
    def VALIDATE_INPUTS(s, image):
        return True
 class ImageScale:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,79 +1,9 @@
 [project]
 name = "ComfyUI"
-version = "0.3.26"
+version = "0.3.18"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
 dependencies = [
    "torchsde",
    "numpy>=1.25.0",
    "einops",
    "transformers>=4.28.1",
    "tokenizers>=0.13.3",
    "sentencepiece",
    "safetensors>=0.4.2",
    "aiohttp",
    "pyyaml",
    "Pillow",
    "scipy",
    "tqdm",
    "psutil",
    # Optional dependencies
    "kornia>=0.7.1",
    "spandrel",
    "soundfile",
    "comfyui-manager",
 ]
 [project.optional-dependencies]
 cpu = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 cu126 = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 cu124 = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 cu118 = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 rocm = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 xpus = [
  "torch",
  "torchvision",
  "torchaudio",
 ]
 [tool.uv]
 conflicts = [
  [
    { extra = "cpu" },
    { extra = "cu126" },
    { extra = "cu124" },
    { extra = "cu118" },
    { extra = "rocm" },
    { extra = "xpus" },
  ],
 ]
 [project.urls]
 homepage = "https://www.comfy.org/"
@@ -91,61 +21,3 @@ lint.select = [
  "F",
 ]
 exclude = ["*.ipynb"]
 [tool.uv.sources]
 comfyui-manager = { path = "custom_nodes/ComfyUI-Manager" }
 torch = [
  { index = "pytorch-cu126", extra = "cu126" },
  { index = "pytorch-cu124", extra = "cu124" },
  { index = "pytorch-cu118", extra = "cu118" },
  { index = "pytorch-cpu", extra = "cpu" },
  { index = "pytorch-rocm", extra = "rocm" },
  { index = "pytorch-xpu", extra = "xpus" },
 ]
 torchvision = [
    { index = "pytorch-cu126", extra = "cu126" },
    { index = "pytorch-cu124", extra = "cu124" },
    { index = "pytorch-cu118", extra = "cu118" },
    { index = "pytorch-cpu", extra = "cpu" },
    { index = "pytorch-rocm", extra = "rocm" },
    { index = "pytorch-xpu", extra = "xpus" },
 ]
 torchaudio = [
  { index = "pytorch-cu126", extra = "cu126" },
  { index = "pytorch-cu124", extra = "cu124" },
  { index = "pytorch-cu118", extra = "cu118" },
  { index = "pytorch-cpu", extra = "cpu" },
  { index = "pytorch-rocm", extra = "rocm" },
  { index = "pytorch-xpu", extra = "xpus" },
 ]
 [[tool.uv.index]]
 name = "pytorch-cu126"
 url = "https://download.pytorch.org/whl/cu126"
 explicit = true
 [[tool.uv.index]]
 name = "pytorch-cu124"
 url = "https://download.pytorch.org/whl/cu124"
 explicit = true
 [[tool.uv.index]]
 name = "pytorch-cu118"
 url = "https://download.pytorch.org/whl/cu118"
 explicit = true
 [[tool.uv.index]]
 name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
 explicit = true
 [[tool.uv.index]]
 name = "pytorch-rocm"
 url = "https://download.pytorch.org/whl/rocm6.2"
 explicit = true
 [[tool.uv.index]]
 name = "pytorch-xpu"
 url = "https://download.pytorch.org/whl/xpu"
 explicit = true
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.11.8
+comfyui-frontend-package==1.10.17
 torch
 torchsde
 torchvision
--- a/tests-unit/app_test/frontend_manager_test.py
+++ b/tests-unit/app_test/frontend_manager_test.py
@@ -70,7 +70,7 @@ def test_get_release_invalid_version(mock_provider):
 def test_init_frontend_default():
    version_string = DEFAULT_VERSION_STRING
    frontend_path = FrontendManager.init_frontend(version_string)
-    assert frontend_path == FrontendManager.default_frontend_path()
+    assert frontend_path == FrontendManager.DEFAULT_FRONTEND_PATH
 def test_init_frontend_invalid_version():
@@ -84,29 +84,24 @@ def test_init_frontend_invalid_provider():
    with pytest.raises(HTTPError):
        FrontendManager.init_frontend_unsafe(version_string)
@pytest.fixture
 def mock_os_functions():
-    with (
+    with patch('app.frontend_management.os.makedirs') as mock_makedirs, \
-        patch("app.frontend_management.os.makedirs") as mock_makedirs,
+         patch('app.frontend_management.os.listdir') as mock_listdir, \
-        patch("app.frontend_management.os.listdir") as mock_listdir,
+         patch('app.frontend_management.os.rmdir') as mock_rmdir:
        patch("app.frontend_management.os.rmdir") as mock_rmdir,
    ):
        mock_listdir.return_value = []  # Simulate empty directory
        yield mock_makedirs, mock_listdir, mock_rmdir
@pytest.fixture
 def mock_download():
-    with patch("app.frontend_management.download_release_asset_zip") as mock:
+    with patch('app.frontend_management.download_release_asset_zip') as mock:
        mock.side_effect = Exception("Download failed")  # Simulate download failure
        yield mock
 def test_finally_block(mock_os_functions, mock_download, mock_provider):
    # Arrange
    mock_makedirs, mock_listdir, mock_rmdir = mock_os_functions
-    version_string = "test-owner/test-repo@1.0.0"
+    version_string = 'test-owner/test-repo@1.0.0'
    # Act & Assert
    with pytest.raises(Exception):
@@ -133,42 +128,3 @@ def test_parse_version_string_invalid():
    version_string = "invalid"
    with pytest.raises(argparse.ArgumentTypeError):
        FrontendManager.parse_version_string(version_string)
 def test_init_frontend_default_with_mocks():
    # Arrange
    version_string = DEFAULT_VERSION_STRING
    # Act
    with (
        patch("app.frontend_management.check_frontend_version") as mock_check,
        patch.object(
            FrontendManager, "default_frontend_path", return_value="/mocked/path"
        ),
    ):
        frontend_path = FrontendManager.init_frontend(version_string)
    # Assert
    assert frontend_path == "/mocked/path"
    mock_check.assert_called_once()
 def test_init_frontend_fallback_on_error():
    # Arrange
    version_string = "test-owner/test-repo@1.0.0"
    # Act
    with (
        patch.object(
            FrontendManager, "init_frontend_unsafe", side_effect=Exception("Test error")
        ),
        patch("app.frontend_management.check_frontend_version") as mock_check,
        patch.object(
            FrontendManager, "default_frontend_path", return_value="/default/path"
        ),
    ):
        frontend_path = FrontendManager.init_frontend(version_string)
    # Assert
    assert frontend_path == "/default/path"
    mock_check.assert_called_once()
		`@@ -1,2 +0,0 @@`
			`.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation`
			`pause`