Compare commits

1 Commit

Author:  Yoland Yan
SHA1:    ef0c2b0819
Message: Add pre-commit configuration and update README for backend development
Date:    2025-03-02 12:43:19 -08:00
43 changed files with 313 additions and 1093 deletions

View File

@@ -22,7 +22,7 @@ on:
description: 'Python patch version'
required: true
type: string
default: "9"
default: "8"
jobs:

View File

@@ -29,7 +29,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "9"
default: "8"
# push:
# branches:
# - master

View File

@@ -19,7 +19,7 @@ on:
description: 'python patch version'
required: true
type: string
default: "9"
default: "8"
# push:
# branches:
# - master

.pre-commit-config.yaml (new file, 13 lines)
View File

@@ -0,0 +1,13 @@
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.241 # Use the desired version of Ruff
hooks:
- id: ruff
- repo: local
hooks:
- id: pytest
name: Run Pytest
entry: pytest
language: system
types: [python]

View File

@@ -330,6 +330,25 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
See also: [https://www.comfy.org/](https://www.comfy.org/)
## ComfyUI Backend Development
### Setup Environment
Install pre-commit to run tests and linters
```
pip install pre-commit
```
```
pre-commit install
```
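With the hooks installed they run automatically on every `git commit`; to check the whole tree in one go you can also invoke them manually, for example:
```
pre-commit run --all-files
```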
### Reporting Issues and Requesting Features
For any bugs, issues, or feature requests related to the backend, please use the [ComfyUI repository](https://github.com/comfyanonymous/ComfyUI). This will help us manage and address backend-specific concerns more efficiently.
## Frontend Development
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.

View File

@@ -3,7 +3,6 @@ import argparse
import logging
import os
import re
import sys
import tempfile
import zipfile
import importlib
@@ -20,11 +19,10 @@ from comfy.cli_args import DEFAULT_VERSION_STRING
try:
import comfyui_frontend_package
except ImportError:
except ImportError as e:
# TODO: Remove the check after roll out of 0.3.16
req_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'requirements.txt'))
logging.error(f"\n\n********** ERROR ***********\n\ncomfyui-frontend-package is not installed. Please install the updated requirements.txt file by running:\n{sys.executable} -m pip install -r {req_path}\n\nThis error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.\n\nIf you are on the portable package you can run: update\\update_comfyui.bat to solve this problem\n********** ERROR **********\n")
exit(-1)
logging.error("comfyui-frontend-package is not installed. Please install the updated requirements.txt file by running: pip install -r requirements.txt")
raise e
REQUEST_TIMEOUT = 10 # seconds

View File

@@ -1,6 +1,7 @@
import argparse
import enum
import os
from typing import Optional
import comfy.options
@@ -165,14 +166,13 @@ parser.add_argument(
""",
)
def is_valid_directory(path: str) -> str:
"""Validate if the given path is a directory, and check permissions."""
if not os.path.exists(path):
raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
def is_valid_directory(path: Optional[str]) -> Optional[str]:
"""Validate if the given path is a directory."""
if path is None:
return None
if not os.path.isdir(path):
raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
if not os.access(path, os.R_OK):
raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
return path
parser.add_argument(
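
The hunk above rewrites `is_valid_directory` so that it accepts an optional path, returns `None` unchanged, and only checks `os.path.isdir`. As a minimal illustrative sketch (not part of the diff), this is how such a validator typically plugs into argparse via `type=`; the `--front-end-root` flag name is an assumption here, and the real option in `cli_args.py` may differ:
```
import argparse
import os
from typing import Optional

def is_valid_directory(path: Optional[str]) -> Optional[str]:
    """Return the path unchanged if it is None or an existing directory, else raise."""
    if path is None:
        return None
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
    return path

parser = argparse.ArgumentParser()
# Hypothetical flag name for illustration. With default=None, argparse never calls
# the type callable on the default, and the Optional handling also covers direct calls.
parser.add_argument("--front-end-root", type=is_valid_directory, default=None)

args = parser.parse_args(["--front-end-root", "."])  # "." is a directory, so validation passes
print(args.front_end_root)
```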

View File

@@ -97,12 +97,8 @@ class CLIPTextModel_(torch.nn.Module):
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
if embeds is not None:
x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
else:
x = self.embeddings(input_tokens, dtype=dtype)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
x = self.embeddings(input_tokens, dtype=dtype)
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -120,10 +116,7 @@ class CLIPTextModel_(torch.nn.Module):
if i is not None and final_layer_norm_intermediate:
i = self.final_layer_norm(i)
if num_tokens is not None:
pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
else:
pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
return x, i, pooled_output
class CLIPTextModel(torch.nn.Module):
@@ -211,15 +204,6 @@ class CLIPVision(torch.nn.Module):
pooled_output = self.post_layernorm(x[:, 0, :])
return x, i, pooled_output
class LlavaProjector(torch.nn.Module):
def __init__(self, in_dim, out_dim, dtype, device, operations):
super().__init__()
self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
def forward(self, x):
return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
class CLIPVisionModelProjection(torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
@@ -229,16 +213,7 @@ class CLIPVisionModelProjection(torch.nn.Module):
else:
self.visual_projection = lambda a: a
if "llava3" == config_dict.get("projector_type", None):
self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
else:
self.multi_modal_projector = None
def forward(self, *args, **kwargs):
x = self.vision_model(*args, **kwargs)
out = self.visual_projection(x[2])
projected = None
if self.multi_modal_projector is not None:
projected = self.multi_modal_projector(x[1])
return (x[0], x[1], out, projected)
return (x[0], x[1], out)

View File

@@ -65,7 +65,6 @@ class ClipVisionModel():
outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
outputs["mm_projected"] = out[3]
return outputs
def convert_to_transformers(sd, prefix):
@@ -105,10 +104,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
if "multi_modal_projector.linear_1.bias" in sd:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
else:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
else:
json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
else:

View File

@@ -1,19 +0,0 @@
{
"attention_dropout": 0.0,
"dropout": 0.0,
"hidden_act": "quick_gelu",
"hidden_size": 1024,
"image_size": 336,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-5,
"model_type": "clip_vision_model",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"projection_dim": 768,
"projector_type": "llava3",
"torch_dtype": "float32"
}

View File

@@ -1,6 +1,6 @@
import torch
from typing import Callable, Protocol, TypedDict, Optional, List
from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator
from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin
class UnetApplyFunction(Protocol):
@@ -42,5 +42,4 @@ __all__ = [
InputTypeDict.__name__,
ComfyNodeABC.__name__,
CheckLazyMixin.__name__,
FileLocator.__name__,
]

View File

@@ -134,8 +134,6 @@ class InputTypeOptions(TypedDict):
"""
remote: RemoteInputOptions
"""Specifies the configuration for a remote input."""
control_after_generate: bool
"""Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
class HiddenInputTypeDict(TypedDict):
@@ -295,14 +293,3 @@ class CheckLazyMixin:
need = [name for name in kwargs if kwargs[name] is None]
return need
class FileLocator(TypedDict):
"""Provides type hinting for the file location"""
filename: str
"""The filename of the file."""
subfolder: str
"""The subfolder of the file."""
type: Literal["input", "output", "temp"]
"""The root folder of the file."""

View File

@@ -7,7 +7,7 @@ from einops import rearrange
import math
from typing import Dict, Optional, Tuple
from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
from .symmetric_patchifier import SymmetricPatchifier
def get_timestep_embedding(
@@ -377,16 +377,12 @@ class LTXVModel(torch.nn.Module):
positional_embedding_theta=10000.0,
positional_embedding_max_pos=[20, 2048, 2048],
causal_temporal_positioning=False,
vae_scale_factors=(8, 32, 32),
dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.generator = None
self.vae_scale_factors = vae_scale_factors
self.dtype = dtype
self.out_channels = in_channels
self.inner_dim = num_attention_heads * attention_head_dim
self.causal_temporal_positioning = causal_temporal_positioning
self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
@@ -420,23 +416,42 @@ class LTXVModel(torch.nn.Module):
self.patchifier = SymmetricPatchifier(1)
def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
patches_replace = transformer_options.get("patches_replace", {})
indices_grid = self.patchifier.get_grid(
orig_num_frames=x.shape[2],
orig_height=x.shape[3],
orig_width=x.shape[4],
batch_size=x.shape[0],
scale_grid=((1 / frame_rate) * 8, 32, 32),
device=x.device,
)
if guiding_latent is not None:
ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
ts *= input_ts
ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
timestep = self.patchifier.patchify(ts)
input_x = x.clone()
x[:, :, 0] = guiding_latent[:, :, 0]
if guiding_latent_noise_scale > 0:
if self.generator is None:
self.generator = torch.Generator(device=x.device).manual_seed(42)
elif self.generator.device != x.device:
self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
scale = guiding_latent_noise_scale * (input_ts ** 2)
guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] * (1.0 - scale[:, :, 0])
orig_shape = list(x.shape)
x, latent_coords = self.patchifier.patchify(x)
pixel_coords = latent_to_pixel_coords(
latent_coords=latent_coords,
scale_factors=self.vae_scale_factors,
causal_fix=self.causal_temporal_positioning,
)
if keyframe_idxs is not None:
pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
fractional_coords = pixel_coords.to(torch.float32)
fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
x = self.patchifier.patchify(x)
x = self.patchify_proj(x)
timestep = timestep * 1000.0
@@ -444,7 +459,7 @@ class LTXVModel(torch.nn.Module):
if attention_mask is not None and not torch.is_floating_point(attention_mask):
attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max
pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)
pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
batch_size = x.shape[0]
timestep, embedded_timestep = self.adaln_single(
@@ -504,4 +519,8 @@ class LTXVModel(torch.nn.Module):
out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
)
if guiding_latent is not None:
x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
# print("res", x)
return x

View File

@@ -6,29 +6,16 @@ from einops import rearrange
from torch import Tensor
def latent_to_pixel_coords(
latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
) -> Tensor:
"""
Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
configuration.
Args:
latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
containing the latent corner coordinates of each token.
scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
causal_fix (bool): Whether to take into account the different temporal scale
of the first frame. Default = False for backwards compatibility.
Returns:
Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
"""
pixel_coords = (
latent_coords
* torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
)
if causal_fix:
# Fix temporal scale for first frame to 1 due to causality
pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
return pixel_coords
def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
"""Appends dimensions to the end of a tensor until it has target_dims dimensions."""
dims_to_append = target_dims - x.ndim
if dims_to_append < 0:
raise ValueError(
f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
)
elif dims_to_append == 0:
return x
return x[(...,) + (None,) * dims_to_append]
class Patchifier(ABC):
@@ -57,26 +44,29 @@ class Patchifier(ABC):
def patch_size(self):
return self._patch_size
def get_latent_coords(
self, latent_num_frames, latent_height, latent_width, batch_size, device
def get_grid(
self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
):
"""
Return a tensor of shape [batch_size, 3, num_patches] containing the
top-left corner latent coordinates of each latent patch.
The tensor is repeated for each batch element.
"""
latent_sample_coords = torch.meshgrid(
torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
torch.arange(0, latent_height, self._patch_size[1], device=device),
torch.arange(0, latent_width, self._patch_size[2], device=device),
indexing="ij",
)
latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
latent_coords = rearrange(
latent_coords, "b c f h w -> b c (f h w)", b=batch_size
)
return latent_coords
f = orig_num_frames // self._patch_size[0]
h = orig_height // self._patch_size[1]
w = orig_width // self._patch_size[2]
grid_h = torch.arange(h, dtype=torch.float32, device=device)
grid_w = torch.arange(w, dtype=torch.float32, device=device)
grid_f = torch.arange(f, dtype=torch.float32, device=device)
grid = torch.meshgrid(grid_f, grid_h, grid_w, indexing='ij')
grid = torch.stack(grid, dim=0)
grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
if scale_grid is not None:
for i in range(3):
if isinstance(scale_grid[i], Tensor):
scale = append_dims(scale_grid[i], grid.ndim - 1)
else:
scale = scale_grid[i]
grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
return grid
class SymmetricPatchifier(Patchifier):
@@ -84,8 +74,6 @@ class SymmetricPatchifier(Patchifier):
self,
latents: Tensor,
) -> Tuple[Tensor, Tensor]:
b, _, f, h, w = latents.shape
latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
latents = rearrange(
latents,
"b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@@ -93,7 +81,7 @@ class SymmetricPatchifier(Patchifier):
p2=self._patch_size[1],
p3=self._patch_size[2],
)
return latents, latent_coords
return latents
def unpatchify(
self,

View File

@@ -15,7 +15,6 @@ class CausalConv3d(nn.Module):
stride: Union[int, Tuple[int]] = 1,
dilation: int = 1,
groups: int = 1,
spatial_padding_mode: str = "zeros",
**kwargs,
):
super().__init__()
@@ -39,7 +38,7 @@ class CausalConv3d(nn.Module):
stride=stride,
dilation=dilation,
padding=padding,
padding_mode=spatial_padding_mode,
padding_mode="zeros",
groups=groups,
)

View File

@@ -1,15 +1,13 @@
from __future__ import annotations
import torch
from torch import nn
from functools import partial
import math
from einops import rearrange
from typing import List, Optional, Tuple, Union
from typing import Optional, Tuple, Union
from .conv_nd_factory import make_conv_nd, make_linear_nd
from .pixel_norm import PixelNorm
from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
import comfy.ops
ops = comfy.ops.disable_weight_init
class Encoder(nn.Module):
@@ -34,7 +32,7 @@ class Encoder(nn.Module):
norm_layer (`str`, *optional*, defaults to `group_norm`):
The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
latent_log_var (`str`, *optional*, defaults to `per_channel`):
The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
"""
def __init__(
@@ -42,13 +40,12 @@ class Encoder(nn.Module):
dims: Union[int, Tuple[int, int]] = 3,
in_channels: int = 3,
out_channels: int = 3,
blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
blocks=[("res_x", 1)],
base_channels: int = 128,
norm_num_groups: int = 32,
patch_size: Union[int, Tuple[int]] = 1,
norm_layer: str = "group_norm", # group_norm, pixel_norm
latent_log_var: str = "per_channel",
spatial_padding_mode: str = "zeros",
):
super().__init__()
self.patch_size = patch_size
@@ -68,7 +65,6 @@ class Encoder(nn.Module):
stride=1,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
self.down_blocks = nn.ModuleList([])
@@ -86,7 +82,6 @@ class Encoder(nn.Module):
resnet_eps=1e-6,
resnet_groups=norm_num_groups,
norm_layer=norm_layer,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "res_x_y":
output_channel = block_params.get("multiplier", 2) * output_channel
@@ -97,7 +92,6 @@ class Encoder(nn.Module):
eps=1e-6,
groups=norm_num_groups,
norm_layer=norm_layer,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_time":
block = make_conv_nd(
@@ -107,7 +101,6 @@ class Encoder(nn.Module):
kernel_size=3,
stride=(2, 1, 1),
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_space":
block = make_conv_nd(
@@ -117,7 +110,6 @@ class Encoder(nn.Module):
kernel_size=3,
stride=(1, 2, 2),
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_all":
block = make_conv_nd(
@@ -127,7 +119,6 @@ class Encoder(nn.Module):
kernel_size=3,
stride=(2, 2, 2),
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_all_x_y":
output_channel = block_params.get("multiplier", 2) * output_channel
@@ -138,34 +129,6 @@ class Encoder(nn.Module):
kernel_size=3,
stride=(2, 2, 2),
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_all_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(2, 2, 2),
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_space_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(1, 2, 2),
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_time_res":
output_channel = block_params.get("multiplier", 2) * output_channel
block = SpaceToDepthDownsample(
dims=dims,
in_channels=input_channel,
out_channels=output_channel,
stride=(2, 1, 1),
spatial_padding_mode=spatial_padding_mode,
)
else:
raise ValueError(f"unknown block: {block_name}")
@@ -189,18 +152,10 @@ class Encoder(nn.Module):
conv_out_channels *= 2
elif latent_log_var == "uniform":
conv_out_channels += 1
elif latent_log_var == "constant":
conv_out_channels += 1
elif latent_log_var != "none":
raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
self.conv_out = make_conv_nd(
dims,
output_channel,
conv_out_channels,
3,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
dims, output_channel, conv_out_channels, 3, padding=1, causal=True
)
self.gradient_checkpointing = False
@@ -242,15 +197,6 @@ class Encoder(nn.Module):
sample = torch.cat([sample, repeated_last_channel], dim=1)
else:
raise ValueError(f"Invalid input shape: {sample.shape}")
elif self.latent_log_var == "constant":
sample = sample[:, :-1, ...]
approx_ln_0 = (
-30
) # this is the minimal clamp value in DiagonalGaussianDistribution objects
sample = torch.cat(
[sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
dim=1,
)
return sample
@@ -285,7 +231,7 @@ class Decoder(nn.Module):
dims,
in_channels: int = 3,
out_channels: int = 3,
blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
blocks=[("res_x", 1)],
base_channels: int = 128,
layers_per_block: int = 2,
norm_num_groups: int = 32,
@@ -293,7 +239,6 @@ class Decoder(nn.Module):
norm_layer: str = "group_norm",
causal: bool = True,
timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
):
super().__init__()
self.patch_size = patch_size
@@ -319,7 +264,6 @@ class Decoder(nn.Module):
stride=1,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
self.up_blocks = nn.ModuleList([])
@@ -339,7 +283,6 @@ class Decoder(nn.Module):
norm_layer=norm_layer,
inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=timestep_conditioning,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "attn_res_x":
block = UNetMidBlock3D(
@@ -351,7 +294,6 @@ class Decoder(nn.Module):
inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=timestep_conditioning,
attention_head_dim=block_params["attention_head_dim"],
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "res_x_y":
output_channel = output_channel // block_params.get("multiplier", 2)
@@ -364,21 +306,14 @@ class Decoder(nn.Module):
norm_layer=norm_layer,
inject_noise=block_params.get("inject_noise", False),
timestep_conditioning=False,
spatial_padding_mode=spatial_padding_mode,
)
elif block_name == "compress_time":
block = DepthToSpaceUpsample(
dims=dims,
in_channels=input_channel,
stride=(2, 1, 1),
spatial_padding_mode=spatial_padding_mode,
dims=dims, in_channels=input_channel, stride=(2, 1, 1)
)
elif block_name == "compress_space":
block = DepthToSpaceUpsample(
dims=dims,
in_channels=input_channel,
stride=(1, 2, 2),
spatial_padding_mode=spatial_padding_mode,
dims=dims, in_channels=input_channel, stride=(1, 2, 2)
)
elif block_name == "compress_all":
output_channel = output_channel // block_params.get("multiplier", 1)
@@ -388,7 +323,6 @@ class Decoder(nn.Module):
stride=(2, 2, 2),
residual=block_params.get("residual", False),
out_channels_reduction_factor=block_params.get("multiplier", 1),
spatial_padding_mode=spatial_padding_mode,
)
else:
raise ValueError(f"unknown layer: {block_name}")
@@ -406,13 +340,7 @@ class Decoder(nn.Module):
self.conv_act = nn.SiLU()
self.conv_out = make_conv_nd(
dims,
output_channel,
out_channels,
3,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
dims, output_channel, out_channels, 3, padding=1, causal=True
)
self.gradient_checkpointing = False
@@ -505,12 +433,6 @@ class UNetMidBlock3D(nn.Module):
resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
resnet_groups (`int`, *optional*, defaults to 32):
The number of groups to use in the group normalization layers of the resnet blocks.
norm_layer (`str`, *optional*, defaults to `group_norm`):
The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
inject_noise (`bool`, *optional*, defaults to `False`):
Whether to inject noise into the hidden states.
timestep_conditioning (`bool`, *optional*, defaults to `False`):
Whether to condition the hidden states on the timestep.
Returns:
`torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@@ -529,7 +451,6 @@ class UNetMidBlock3D(nn.Module):
norm_layer: str = "group_norm",
inject_noise: bool = False,
timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
):
super().__init__()
resnet_groups = (
@@ -555,17 +476,13 @@ class UNetMidBlock3D(nn.Module):
norm_layer=norm_layer,
inject_noise=inject_noise,
timestep_conditioning=timestep_conditioning,
spatial_padding_mode=spatial_padding_mode,
)
for _ in range(num_layers)
]
)
def forward(
self,
hidden_states: torch.FloatTensor,
causal: bool = True,
timestep: Optional[torch.Tensor] = None,
self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
) -> torch.FloatTensor:
timestep_embed = None
if self.timestep_conditioning:
@@ -590,62 +507,9 @@ class UNetMidBlock3D(nn.Module):
return hidden_states
class SpaceToDepthDownsample(nn.Module):
def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
super().__init__()
self.stride = stride
self.group_size = in_channels * math.prod(stride) // out_channels
self.conv = make_conv_nd(
dims=dims,
in_channels=in_channels,
out_channels=out_channels // math.prod(stride),
kernel_size=3,
stride=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
def forward(self, x, causal: bool = True):
if self.stride[0] == 2:
x = torch.cat(
[x[:, :, :1, :, :], x], dim=2
) # duplicate first frames for padding
# skip connection
x_in = rearrange(
x,
"b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
p1=self.stride[0],
p2=self.stride[1],
p3=self.stride[2],
)
x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
x_in = x_in.mean(dim=2)
# conv
x = self.conv(x, causal=causal)
x = rearrange(
x,
"b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
p1=self.stride[0],
p2=self.stride[1],
p3=self.stride[2],
)
x = x + x_in
return x
class DepthToSpaceUpsample(nn.Module):
def __init__(
self,
dims,
in_channels,
stride,
residual=False,
out_channels_reduction_factor=1,
spatial_padding_mode="zeros",
self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
):
super().__init__()
self.stride = stride
@@ -659,7 +523,6 @@ class DepthToSpaceUpsample(nn.Module):
kernel_size=3,
stride=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
self.residual = residual
self.out_channels_reduction_factor = out_channels_reduction_factor
@@ -695,7 +558,7 @@ class DepthToSpaceUpsample(nn.Module):
class LayerNorm(nn.Module):
def __init__(self, dim, eps, elementwise_affine=True) -> None:
super().__init__()
self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
def forward(self, x):
x = rearrange(x, "b c d h w -> b d h w c")
@@ -728,7 +591,6 @@ class ResnetBlock3D(nn.Module):
norm_layer: str = "group_norm",
inject_noise: bool = False,
timestep_conditioning: bool = False,
spatial_padding_mode: str = "zeros",
):
super().__init__()
self.in_channels = in_channels
@@ -755,7 +617,6 @@ class ResnetBlock3D(nn.Module):
stride=1,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
if inject_noise:
@@ -780,7 +641,6 @@ class ResnetBlock3D(nn.Module):
stride=1,
padding=1,
causal=True,
spatial_padding_mode=spatial_padding_mode,
)
if inject_noise:
@@ -941,44 +801,9 @@ class processor(nn.Module):
return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)
class VideoVAE(nn.Module):
def __init__(self, version=0, config=None):
def __init__(self, version=0):
super().__init__()
if config is None:
config = self.guess_config(version)
self.timestep_conditioning = config.get("timestep_conditioning", False)
double_z = config.get("double_z", True)
latent_log_var = config.get(
"latent_log_var", "per_channel" if double_z else "none"
)
self.encoder = Encoder(
dims=config["dims"],
in_channels=config.get("in_channels", 3),
out_channels=config["latent_channels"],
blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
latent_log_var=latent_log_var,
norm_layer=config.get("norm_layer", "group_norm"),
spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
)
self.decoder = Decoder(
dims=config["dims"],
in_channels=config["latent_channels"],
out_channels=config.get("out_channels", 3),
blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
norm_layer=config.get("norm_layer", "group_norm"),
causal=config.get("causal_decoder", False),
timestep_conditioning=self.timestep_conditioning,
spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
)
self.per_channel_statistics = processor()
def guess_config(self, version):
if version == 0:
config = {
"_class_name": "CausalVideoAutoencoder",
@@ -1005,7 +830,7 @@ class VideoVAE(nn.Module):
"use_quant_conv": False,
"causal_decoder": False,
}
elif version == 1:
else:
config = {
"_class_name": "CausalVideoAutoencoder",
"dims": 3,
@@ -1041,47 +866,37 @@ class VideoVAE(nn.Module):
"causal_decoder": False,
"timestep_conditioning": True,
}
else:
config = {
"_class_name": "CausalVideoAutoencoder",
"dims": 3,
"in_channels": 3,
"out_channels": 3,
"latent_channels": 128,
"encoder_blocks": [
["res_x", {"num_layers": 4}],
["compress_space_res", {"multiplier": 2}],
["res_x", {"num_layers": 6}],
["compress_time_res", {"multiplier": 2}],
["res_x", {"num_layers": 6}],
["compress_all_res", {"multiplier": 2}],
["res_x", {"num_layers": 2}],
["compress_all_res", {"multiplier": 2}],
["res_x", {"num_layers": 2}]
],
"decoder_blocks": [
["res_x", {"num_layers": 5, "inject_noise": False}],
["compress_all", {"residual": True, "multiplier": 2}],
["res_x", {"num_layers": 5, "inject_noise": False}],
["compress_all", {"residual": True, "multiplier": 2}],
["res_x", {"num_layers": 5, "inject_noise": False}],
["compress_all", {"residual": True, "multiplier": 2}],
["res_x", {"num_layers": 5, "inject_noise": False}]
],
"scaling_factor": 1.0,
"norm_layer": "pixel_norm",
"patch_size": 4,
"latent_log_var": "uniform",
"use_quant_conv": False,
"causal_decoder": False,
"timestep_conditioning": True
}
return config
double_z = config.get("double_z", True)
latent_log_var = config.get(
"latent_log_var", "per_channel" if double_z else "none"
)
self.encoder = Encoder(
dims=config["dims"],
in_channels=config.get("in_channels", 3),
out_channels=config["latent_channels"],
blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
latent_log_var=latent_log_var,
norm_layer=config.get("norm_layer", "group_norm"),
)
self.decoder = Decoder(
dims=config["dims"],
in_channels=config["latent_channels"],
out_channels=config.get("out_channels", 3),
blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
patch_size=config.get("patch_size", 1),
norm_layer=config.get("norm_layer", "group_norm"),
causal=config.get("causal_decoder", False),
timestep_conditioning=config.get("timestep_conditioning", False),
)
self.timestep_conditioning = config.get("timestep_conditioning", False)
self.per_channel_statistics = processor()
def encode(self, x):
frames_count = x.shape[2]
if ((frames_count - 1) % 8) != 0:
raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
return self.per_channel_statistics.normalize(means)

View File

@@ -17,11 +17,7 @@ def make_conv_nd(
groups=1,
bias=True,
causal=False,
spatial_padding_mode="zeros",
temporal_padding_mode="zeros",
):
if not (spatial_padding_mode == temporal_padding_mode or causal):
raise NotImplementedError("spatial and temporal padding modes must be equal")
if dims == 2:
return ops.Conv2d(
in_channels=in_channels,
@@ -32,7 +28,6 @@ def make_conv_nd(
dilation=dilation,
groups=groups,
bias=bias,
padding_mode=spatial_padding_mode,
)
elif dims == 3:
if causal:
@@ -45,7 +40,6 @@ def make_conv_nd(
dilation=dilation,
groups=groups,
bias=bias,
spatial_padding_mode=spatial_padding_mode,
)
return ops.Conv3d(
in_channels=in_channels,
@@ -56,7 +50,6 @@ def make_conv_nd(
dilation=dilation,
groups=groups,
bias=bias,
padding_mode=spatial_padding_mode,
)
elif dims == (2, 1):
return DualConv3d(
@@ -66,7 +59,6 @@ def make_conv_nd(
stride=stride,
padding=padding,
bias=bias,
padding_mode=spatial_padding_mode,
)
else:
raise ValueError(f"unsupported dimensions: {dims}")

View File

@@ -18,13 +18,11 @@ class DualConv3d(nn.Module):
dilation: Union[int, Tuple[int, int, int]] = 1,
groups=1,
bias=True,
padding_mode="zeros",
):
super(DualConv3d, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.padding_mode = padding_mode
# Ensure kernel_size, stride, padding, and dilation are tuples of length 3
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size, kernel_size)
@@ -110,7 +108,6 @@ class DualConv3d(nn.Module):
self.padding1,
self.dilation1,
self.groups,
padding_mode=self.padding_mode,
)
if skip_time_conv:
@@ -125,7 +122,6 @@ class DualConv3d(nn.Module):
self.padding2,
self.dilation2,
self.groups,
padding_mode=self.padding_mode,
)
return x
@@ -141,16 +137,7 @@ class DualConv3d(nn.Module):
stride1 = (self.stride1[1], self.stride1[2])
padding1 = (self.padding1[1], self.padding1[2])
dilation1 = (self.dilation1[1], self.dilation1[2])
x = F.conv2d(
x,
weight1,
self.bias1,
stride1,
padding1,
dilation1,
self.groups,
padding_mode=self.padding_mode,
)
x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
_, _, h, w = x.shape
@@ -167,16 +154,7 @@ class DualConv3d(nn.Module):
stride2 = self.stride2[0]
padding2 = self.padding2[0]
dilation2 = self.dilation2[0]
x = F.conv1d(
x,
weight2,
self.bias2,
stride2,
padding2,
dilation2,
self.groups,
padding_mode=self.padding_mode,
)
x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
return x

View File

@@ -161,13 +161,9 @@ class BaseModel(torch.nn.Module):
extra = extra.to(dtype)
extra_conds[o] = extra
t = self.process_timestep(t, x=x, **extra_conds)
model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
return self.model_sampling.calculate_denoised(sigma, model_output, x)
def process_timestep(self, timestep, **kwargs):
return timestep
def get_dtype(self):
return self.diffusion_model.dtype
@@ -189,11 +185,6 @@ class BaseModel(torch.nn.Module):
if concat_latent_image.shape[1:] != noise.shape[1:]:
concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
if noise.ndim == 5:
if concat_latent_image.shape[-3] < noise.shape[-3]:
concat_latent_image = torch.nn.functional.pad(concat_latent_image, (0, 0, 0, 0, 0, noise.shape[-3] - concat_latent_image.shape[-3]), "constant", 0)
else:
concat_latent_image = concat_latent_image[:, :, :noise.shape[-3]]
concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])
@@ -222,11 +213,6 @@ class BaseModel(torch.nn.Module):
cond_concat.append(self.blank_inpaint_image_like(noise))
elif ck == "mask_inverted":
cond_concat.append(torch.zeros_like(noise)[:, :1])
if ck == "concat_image":
if concat_latent_image is not None:
cond_concat.append(concat_latent_image.to(device))
else:
cond_concat.append(torch.zeros_like(noise))
data = torch.cat(cond_concat, dim=1)
return data
return None
@@ -859,26 +845,17 @@ class LTXV(BaseModel):
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
guiding_latent = kwargs.get("guiding_latent", None)
if guiding_latent is not None:
out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
guiding_latent_noise_scale = kwargs.get("guiding_latent_noise_scale", None)
if guiding_latent_noise_scale is not None:
out["guiding_latent_noise_scale"] = comfy.conds.CONDConstant(guiding_latent_noise_scale)
out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
if denoise_mask is not None:
out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
keyframe_idxs = kwargs.get("keyframe_idxs", None)
if keyframe_idxs is not None:
out['keyframe_idxs'] = comfy.conds.CONDRegular(keyframe_idxs)
return out
def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
if denoise_mask is None:
return timestep
return self.diffusion_model.patchifier.patchify(((denoise_mask) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1)))[:, :1])[0]
def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
return latent_image
class HunyuanVideo(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
@@ -895,24 +872,20 @@ class HunyuanVideo(BaseModel):
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
image = kwargs.get("concat_latent_image", None)
noise = kwargs.get("noise", None)
if image is not None:
padding_shape = (noise.shape[0], 16, noise.shape[2] - 1, noise.shape[3], noise.shape[4])
latent_padding = torch.zeros(padding_shape, device=noise.device, dtype=noise.dtype)
image_latents = torch.cat([image.to(noise), latent_padding], dim=2)
out['c_concat'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_latents))
guidance = kwargs.get("guidance", 6.0)
if guidance is not None:
out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
return out
class HunyuanVideoI2V(HunyuanVideo):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device)
self.concat_keys = ("concat_image", "mask_inverted")
class HunyuanVideoSkyreelsI2V(HunyuanVideo):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device)
self.concat_keys = ("concat_image",)
class CosmosVideo(BaseModel):
def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT)

View File

@@ -1,4 +1,3 @@
import json
import comfy.supported_models
import comfy.supported_models_base
import comfy.utils
@@ -34,7 +33,7 @@ def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack, time_stack_cross
return None
def detect_unet_config(state_dict, key_prefix, metadata=None):
def detect_unet_config(state_dict, key_prefix):
state_dict_keys = list(state_dict.keys())
if '{}joint_blocks.0.context_block.attn.qkv.weight'.format(key_prefix) in state_dict_keys: #mmdit model
@@ -211,8 +210,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
dit_config = {}
dit_config["image_model"] = "ltxv"
if metadata is not None and "config" in metadata:
dit_config.update(json.loads(metadata["config"]).get("transformer", {}))
return dit_config
if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt
@@ -457,8 +454,8 @@ def model_config_from_unet_config(unet_config, state_dict=None):
logging.error("no match {}".format(unet_config))
return None
def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False, metadata=None):
unet_config = detect_unet_config(state_dict, unet_key_prefix, metadata=metadata)
def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False):
unet_config = detect_unet_config(state_dict, unet_key_prefix)
if unet_config is None:
return None
model_config = model_config_from_unet_config(unet_config, state_dict)

View File

@@ -19,12 +19,6 @@ import comfy.hooks
import scipy.stats
import numpy
def add_area_dims(area, num_dims):
while (len(area) // 2) < num_dims:
area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
return area
def get_area_and_mult(conds, x_in, timestep_in):
dims = tuple(x_in.shape[2:])
area = None
@@ -40,9 +34,8 @@ def get_area_and_mult(conds, x_in, timestep_in):
return None
if 'area' in conds:
area = list(conds['area'])
area = add_area_dims(area, len(dims))
if (len(area) // 2) > len(dims):
area = area[:len(dims)] + area[len(area) // 2:(len(area) // 2) + len(dims)]
while (len(area) // 2) < len(dims):
area = [2147483648] + area[:len(area) // 2] + [0] + area[len(area) // 2:]
if 'strength' in conds:
strength = conds['strength']
@@ -60,7 +53,7 @@ def get_area_and_mult(conds, x_in, timestep_in):
if "mask_strength" in conds:
mask_strength = conds["mask_strength"]
mask = conds['mask']
assert (mask.shape[1:] == x_in.shape[2:])
assert(mask.shape[1:] == x_in.shape[2:])
mask = mask[:input_x.shape[0]]
if area is not None:
@@ -74,17 +67,16 @@ def get_area_and_mult(conds, x_in, timestep_in):
mult = mask * strength
if 'mask' not in conds and area is not None:
fuzz = 8
rr = 8
for i in range(len(dims)):
rr = min(fuzz, mult.shape[2 + i] // 4)
if area[len(dims) + i] != 0:
for t in range(rr):
m = mult.narrow(i + 2, t, 1)
m *= ((1.0 / rr) * (t + 1))
m *= ((1.0/rr) * (t + 1))
if (area[i] + area[len(dims) + i]) < x_in.shape[i + 2]:
for t in range(rr):
m = mult.narrow(i + 2, area[i] - 1 - t, 1)
m *= ((1.0 / rr) * (t + 1))
m *= ((1.0/rr) * (t + 1))
conditioning = {}
model_conds = conds["model_conds"]
@@ -559,37 +551,25 @@ def resolve_areas_and_cond_masks(conditions, h, w, device):
logging.warning("WARNING: The comfy.samplers.resolve_areas_and_cond_masks function is deprecated please use the resolve_areas_and_cond_masks_multidim one instead.")
return resolve_areas_and_cond_masks_multidim(conditions, [h, w], device)
def create_cond_with_same_area_if_none(conds, c):
def create_cond_with_same_area_if_none(conds, c): #TODO: handle dim != 2
if 'area' not in c:
return
def area_inside(a, area_cmp):
a = add_area_dims(a, len(area_cmp) // 2)
area_cmp = add_area_dims(area_cmp, len(a) // 2)
a_l = len(a) // 2
area_cmp_l = len(area_cmp) // 2
for i in range(min(a_l, area_cmp_l)):
if a[a_l + i] < area_cmp[area_cmp_l + i]:
return False
for i in range(min(a_l, area_cmp_l)):
if (a[i] + a[a_l + i]) > (area_cmp[i] + area_cmp[area_cmp_l + i]):
return False
return True
c_area = c['area']
smallest = None
for x in conds:
if 'area' in x:
a = x['area']
if area_inside(c_area, a):
if smallest is None:
smallest = x
elif 'area' not in smallest:
smallest = x
else:
if math.prod(smallest['area'][:len(smallest['area']) // 2]) > math.prod(a[:len(a) // 2]):
smallest = x
if c_area[2] >= a[2] and c_area[3] >= a[3]:
if a[0] + a[2] >= c_area[0] + c_area[2]:
if a[1] + a[3] >= c_area[1] + c_area[3]:
if smallest is None:
smallest = x
elif 'area' not in smallest:
smallest = x
else:
if smallest['area'][0] * smallest['area'][1] > a[0] * a[1]:
smallest = x
else:
if smallest is None:
smallest = x

View File

@@ -1,5 +1,4 @@
from __future__ import annotations
import json
import torch
from enum import Enum
import logging
@@ -135,8 +134,8 @@ class CLIP:
def clip_layer(self, layer_idx):
self.layer_idx = layer_idx
def tokenize(self, text, return_word_ids=False, **kwargs):
return self.tokenizer.tokenize_with_weights(text, return_word_ids, **kwargs)
def tokenize(self, text, return_word_ids=False):
return self.tokenizer.tokenize_with_weights(text, return_word_ids)
def add_hooks_to_dict(self, pooled_dict: dict[str]):
if self.apply_hooks_to_conds:
@@ -250,7 +249,7 @@ class CLIP:
return self.patcher.get_key_patches()
class VAE:
def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
def __init__(self, sd=None, device=None, config=None, dtype=None):
if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
sd = diffusers_convert.convert_vae_state_dict(sd)
@@ -358,12 +357,7 @@ class VAE:
version = 0
elif tensor_conv1.shape[0] == 1024:
version = 1
if "encoder.down_blocks.1.conv.conv.bias" in sd:
version = 2
vae_config = None
if metadata is not None and "config" in metadata:
vae_config = json.loads(metadata["config"]).get("vae", None)
self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version, config=vae_config)
self.first_stage_model = comfy.ldm.lightricks.vae.causal_video_autoencoder.VideoVAE(version=version)
self.latent_channels = 128
self.latent_dim = 3
self.memory_used_decode = lambda shape, dtype: (900 * shape[2] * shape[3] * shape[4] * (8 * 8 * 8)) * model_management.dtype_size(dtype)
@@ -879,13 +873,13 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
return (model, clip, vae)
def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
sd = comfy.utils.load_torch_file(ckpt_path)
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options)
if out is None:
raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))
return out
def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}):
clip = None
clipvision = None
vae = None
@@ -897,7 +891,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
load_device = model_management.get_torch_device()
model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
if model_config is None:
return None
@@ -926,7 +920,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
if output_vae:
vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
vae_sd = model_config.process_vae_state_dict(vae_sd)
vae = VAE(sd=vae_sd, metadata=metadata)
vae = VAE(sd=vae_sd)
if output_clip:
clip_target = model_config.clip_target(state_dict=sd)

View File

@@ -158,93 +158,71 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
self.layer_idx = self.options_default[1]
self.return_projected_pooled = self.options_default[2]
def process_tokens(self, tokens, device):
end_token = self.special_tokens.get("end", None)
if end_token is None:
cmp_token = self.special_tokens.get("pad", -1)
else:
cmp_token = end_token
embeds_out = []
attention_masks = []
num_tokens = []
def set_up_textual_embeddings(self, tokens, current_embeds):
out_tokens = []
next_new_token = token_dict_size = current_embeds.weight.shape[0]
embedding_weights = []
for x in tokens:
attention_mask = []
tokens_temp = []
other_embeds = []
eos = False
index = 0
for y in x:
if isinstance(y, numbers.Integral):
if eos:
attention_mask.append(0)
tokens_temp += [int(y)]
else:
if y.shape[0] == current_embeds.weight.shape[1]:
embedding_weights += [y]
tokens_temp += [next_new_token]
next_new_token += 1
else:
attention_mask.append(1)
token = int(y)
tokens_temp += [token]
if not eos and token == cmp_token:
if end_token is None:
attention_mask[-1] = 0
eos = True
else:
other_embeds.append((index, y))
index += 1
logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(y.shape[0], current_embeds.weight.shape[1]))
while len(tokens_temp) < len(x):
tokens_temp += [self.special_tokens["pad"]]
out_tokens += [tokens_temp]
tokens_embed = torch.tensor([tokens_temp], device=device, dtype=torch.long)
tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32)
index = 0
pad_extra = 0
for o in other_embeds:
emb = o[1]
if torch.is_tensor(emb):
emb = {"type": "embedding", "data": emb}
n = token_dict_size
if len(embedding_weights) > 0:
new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
new_embedding.weight[:token_dict_size] = current_embeds.weight
for x in embedding_weights:
new_embedding.weight[n] = x
n += 1
self.transformer.set_input_embeddings(new_embedding)
emb_type = emb.get("type", None)
if emb_type == "embedding":
emb = emb.get("data", None)
else:
if hasattr(self.transformer, "preprocess_embed"):
emb = self.transformer.preprocess_embed(emb, device=device)
else:
emb = None
processed_tokens = []
for x in out_tokens:
processed_tokens += [list(map(lambda a: n if a == -1 else a, x))] #The EOS token should always be the largest one
if emb is None:
index += -1
continue
ind = index + o[0]
emb = emb.view(1, -1, emb.shape[-1]).to(device=device, dtype=torch.float32)
emb_shape = emb.shape[1]
if emb.shape[-1] == tokens_embed.shape[-1]:
tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1)
attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:]
index += emb_shape - 1
else:
index += -1
pad_extra += emb_shape
logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(emb.shape[-1], tokens_embed.shape[-1]))
if pad_extra > 0:
padd_embed = self.transformer.get_input_embeddings()(torch.tensor([[self.special_tokens["pad"]] * pad_extra], device=device, dtype=torch.long), out_dtype=torch.float32)
tokens_embed = torch.cat([tokens_embed, padd_embed], dim=1)
attention_mask = attention_mask + [0] * pad_extra
embeds_out.append(tokens_embed)
attention_masks.append(attention_mask)
num_tokens.append(sum(attention_mask))
return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens
return processed_tokens
def forward(self, tokens):
device = self.transformer.get_input_embeddings().weight.device
embeds, attention_mask, num_tokens = self.process_tokens(tokens, device)
backup_embeds = self.transformer.get_input_embeddings()
device = backup_embeds.weight.device
tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
tokens = torch.LongTensor(tokens).to(device)
attention_mask = None
if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks:
attention_mask = torch.zeros_like(tokens)
end_token = self.special_tokens.get("end", None)
if end_token is None:
cmp_token = self.special_tokens.get("pad", -1)
else:
cmp_token = end_token
for x in range(attention_mask.shape[0]):
for y in range(attention_mask.shape[1]):
attention_mask[x, y] = 1
if tokens[x, y] == cmp_token:
if end_token is None:
attention_mask[x, y] = 0
break
attention_mask_model = None
if self.enable_attention_masks:
attention_mask_model = attention_mask
outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
self.transformer.set_input_embeddings(backup_embeds)
if self.layer == "last":
z = outputs[0].float()
@@ -504,7 +482,7 @@ class SDTokenizer:
return (embed, leftover)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
'''
Takes a prompt and converts it to a list of (token, weight, word id) elements.
Tokens can both be integer tokens and pre computed CLIP tensors.
@@ -618,7 +596,7 @@ class SD1Tokenizer:
tokenizer = tokenizer_data.get("{}_tokenizer_class".format(self.clip), tokenizer)
setattr(self, self.clip, tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data))
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out[self.clip_name] = getattr(self, self.clip).tokenize_with_weights(text, return_word_ids)
return out

View File

@@ -26,7 +26,7 @@ class SDXLTokenizer:
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)

View File

@@ -762,7 +762,7 @@ class LTXV(supported_models_base.BASE):
unet_extra_config = {}
latent_format = latent_formats.LTXV
memory_usage_factor = 5.5 # TODO: img2vid is about 2x vs txt2vid
memory_usage_factor = 2.7
supported_inference_dtypes = [torch.bfloat16, torch.float32]
@@ -826,26 +826,6 @@ class HunyuanVideo(supported_models_base.BASE):
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}llama.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**hunyuan_detect))
class HunyuanVideoI2V(HunyuanVideo):
unet_config = {
"image_model": "hunyuan_video",
"in_channels": 33,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanVideoI2V(self, device=device)
return out
class HunyuanVideoSkyreelsI2V(HunyuanVideo):
unet_config = {
"image_model": "hunyuan_video",
"in_channels": 32,
}
def get_model(self, state_dict, prefix="", device=None):
out = model_base.HunyuanVideoSkyreelsI2V(self, device=device)
return out
class CosmosT2V(supported_models_base.BASE):
unet_config = {
"image_model": "cosmos",
@@ -959,6 +939,6 @@ class WAN21_I2V(WAN21_T2V):
out = model_base.WAN21(self, image_to_video=True, device=device)
return out
models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V]
models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V]
models += [SVD_img2vid]


@@ -93,11 +93,8 @@ class BertEmbeddings(torch.nn.Module):
self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
def forward(self, input_tokens, embeds=None, token_type_ids=None, dtype=None):
if embeds is not None:
x = embeds
else:
x = self.word_embeddings(input_tokens, out_dtype=dtype)
def forward(self, input_tokens, token_type_ids=None, dtype=None):
x = self.word_embeddings(input_tokens, out_dtype=dtype)
x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
if token_type_ids is not None:
x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
@@ -116,8 +113,8 @@ class BertModel_(torch.nn.Module):
self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype)
def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embeddings(input_tokens, dtype=dtype)
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])


@@ -18,7 +18,7 @@ class FluxTokenizer:
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)


@@ -4,7 +4,6 @@ import comfy.text_encoders.llama
from transformers import LlamaTokenizerFast
import torch
import os
import numbers
def llama_detect(state_dict, prefix=""):
@@ -23,7 +22,7 @@ def llama_detect(state_dict, prefix=""):
class LLAMA3Tokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}, min_length=256):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "llama_tokenizer")
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, min_length=min_length)
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, end_token=128009, min_length=min_length)
class LLAMAModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}):
@@ -39,26 +38,15 @@ class HunyuanVideoTokenizer:
def __init__(self, embedding_directory=None, tokenizer_data={}):
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>""" # 95 tokens
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n""" # 95 tokens
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
if llama_template is None:
llama_text = self.llama_template.format(text)
else:
llama_text = llama_template.format(text)
llama_text_tokens = self.llama.tokenize_with_weights(llama_text, return_word_ids)
embed_count = 0
for r in llama_text_tokens:
for i in range(len(r)):
if r[i][0] == 128257:
if image_embeds is not None and embed_count < image_embeds.shape[0]:
r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image"},) + r[i][1:]
embed_count += 1
out["llama"] = llama_text_tokens
llama_text = "{}{}".format(self.llama_template, text)
out["llama"] = self.llama.tokenize_with_weights(llama_text, return_word_ids)
return out
def untokenize(self, token_weight_pair):
@@ -92,45 +80,20 @@ class HunyuanVideoClipModel(torch.nn.Module):
llama_out, llama_pooled, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
template_end = 0
image_start = None
image_end = None
extra_sizes = 0
user_end = 9999999999999
tok_pairs = token_weight_pairs_llama[0]
for i, v in enumerate(tok_pairs):
elem = v[0]
if not torch.is_tensor(elem):
if isinstance(elem, numbers.Integral):
if elem == 128006:
if tok_pairs[i + 1][0] == 882:
if tok_pairs[i + 2][0] == 128007:
template_end = i + 2
user_end = -1
if elem == 128009 and user_end == -1:
user_end = i + 1
else:
if elem.get("original_type") == "image":
elem_size = elem.get("data").shape[0]
if image_start is None:
image_start = i + extra_sizes
image_end = i + elem_size + extra_sizes
extra_sizes += elem_size - 1
for i, v in enumerate(token_weight_pairs_llama[0]):
if v[0] == 128007: # <|end_header_id|>
template_end = i
if llama_out.shape[1] > (template_end + 2):
if tok_pairs[template_end + 1][0] == 271:
if token_weight_pairs_llama[0][template_end + 1][0] == 271:
template_end += 2
llama_output = llama_out[:, template_end + extra_sizes:user_end + extra_sizes]
llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end + extra_sizes:user_end + extra_sizes]
llama_out = llama_out[:, template_end:]
llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end:]
if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
llama_extra_out.pop("attention_mask") # attention mask is useless if no masked elements
if image_start is not None:
image_output = llama_out[:, image_start: image_end]
llama_output = torch.cat([image_output[:, ::2], llama_output], dim=1)
l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
return llama_output, l_pooled, llama_extra_out
return llama_out, l_pooled, llama_extra_out
def load_sd(self, sd):
if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:


@@ -37,7 +37,7 @@ class HyditTokenizer:
self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)


@@ -241,11 +241,8 @@ class Llama2_(nn.Module):
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
# self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
if embeds is not None:
x = embeds
else:
x = self.embed_tokens(x, out_dtype=dtype)
def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
x = self.embed_tokens(x, out_dtype=dtype)
if self.normalize_in:
x *= self.config.hidden_size ** 0.5


@@ -43,7 +43,7 @@ class SD3Tokenizer:
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory)
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
def tokenize_with_weights(self, text:str, return_word_ids=False):
out = {}
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)


@@ -239,11 +239,8 @@ class T5(torch.nn.Module):
def set_input_embeddings(self, embeddings):
self.shared = embeddings
def forward(self, input_ids, attention_mask, embeds=None, num_tokens=None, **kwargs):
if input_ids is None:
x = embeds
else:
x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
def forward(self, input_ids, *args, **kwargs):
x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
x = torch.nan_to_num(x) #Fix for fp8 T5 base
return self.encoder(x, attention_mask=attention_mask, **kwargs)
return self.encoder(x, *args, **kwargs)


@@ -46,18 +46,12 @@ if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in
else:
logging.info("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended.")
def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
def load_torch_file(ckpt, safe_load=False, device=None):
if device is None:
device = torch.device("cpu")
metadata = None
if ckpt.lower().endswith(".safetensors") or ckpt.lower().endswith(".sft"):
try:
with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
sd = {}
for k in f.keys():
sd[k] = f.get_tensor(k)
if return_metadata:
metadata = f.metadata()
sd = safetensors.torch.load_file(ckpt, device=device.type)
except Exception as e:
if len(e.args) > 0:
message = e.args[0]
@@ -83,7 +77,7 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
sd = pl_sd
else:
sd = pl_sd
return (sd, metadata) if return_metadata else sd
return sd
def save_torch_file(sd, ckpt, metadata=None):
if metadata is not None:


@@ -1,5 +1,3 @@
from __future__ import annotations
import torchaudio
import torch
import comfy.model_management
@@ -12,7 +10,6 @@ import random
import hashlib
import node_helpers
from comfy.cli_args import args
from comfy.comfy_types import FileLocator
class EmptyLatentAudio:
def __init__(self):
@@ -167,7 +164,7 @@ class SaveAudio:
def save_audio(self, audio, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
filename_prefix += self.prefix_append
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
results: list[FileLocator] = []
results = list()
metadata = {}
if not args.disable_metadata:


@@ -454,7 +454,7 @@ class SamplerCustom:
return {"required":
{"model": ("MODEL",),
"add_noise": ("BOOLEAN", {"default": True}),
"noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}),
"noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
@@ -605,16 +605,10 @@ class DisableNoise:
class RandomNoise(DisableNoise):
@classmethod
def INPUT_TYPES(s):
return {
"required": {
"noise_seed": ("INT", {
"default": 0,
"min": 0,
"max": 0xffffffffffffffff,
"control_after_generate": True,
}),
}
}
return {"required":{
"noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
}
}
def get_noise(self, noise_seed):
return (Noise_RandomNoise(noise_seed),)
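
Both hunks above toggle the `control_after_generate` flag on INT seed widgets. A minimal sketch of a node declaring such a seed input follows; the node name is illustrative, and the flag itself only tells the frontend to offer fixed/increment/decrement/randomize handling of the value after each run.

```
# Minimal sketch of a node declaring the seed widget toggled in the hunks
# above. The node name is illustrative.
class ExampleSeededNode:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "noise_seed": ("INT", {
                    "default": 0,
                    "min": 0,
                    "max": 0xffffffffffffffff,
                    "control_after_generate": True,
                }),
            }
        }

    RETURN_TYPES = ("INT",)
    FUNCTION = "get_seed"
    CATEGORY = "example"

    def get_seed(self, noise_seed):
        return (noise_seed,)
```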


@@ -1,5 +1,4 @@
import nodes
import node_helpers
import torch
import comfy.model_management
@@ -39,73 +38,7 @@ class EmptyHunyuanLatentVideo:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
return ({"samples":latent}, )
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
"<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
"1. The main content and theme of the video."
"2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
"3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
"4. background environment, light, style and atmosphere."
"5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
"<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
class TextEncodeHunyuanVideo_ImageToVideo:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"clip": ("CLIP", ),
"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "encode"
CATEGORY = "advanced/conditioning"
def encode(self, clip, clip_vision_output, prompt):
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected)
return (clip.encode_from_tokens_scheduled(tokens), )
class HunyuanImageToVideo:
@classmethod
def INPUT_TYPES(s):
return {"required": {"positive": ("CONDITIONING", ),
"vae": ("VAE", ),
"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
"length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
},
"optional": {"start_image": ("IMAGE", ),
}}
RETURN_TYPES = ("CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "latent")
FUNCTION = "encode"
CATEGORY = "conditioning/video_models"
def encode(self, positive, vae, width, height, length, batch_size, start_image=None):
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(start_image)
mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
out_latent = {}
out_latent["samples"] = latent
return (positive, out_latent)
NODE_CLASS_MAPPINGS = {
"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
"TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
"HunyuanImageToVideo": HunyuanImageToVideo,
}
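
The video nodes above allocate latents of shape `[batch, 16, ((length - 1) // 4) + 1, height // 8, width // 8]`, i.e. 4x temporal and 8x spatial compression with one extra latent frame covering the first pixel frame. A small worked example of that arithmetic:

```
# Worked example of the latent sizing used by EmptyHunyuanLatentVideo and
# HunyuanImageToVideo above: 4x temporal and 8x spatial compression, plus one
# extra latent frame covering the first pixel frame.
def hunyuan_latent_shape(batch_size, length, height, width, channels=16):
    return [batch_size, channels, ((length - 1) // 4) + 1, height // 8, width // 8]


if __name__ == "__main__":
    # 53 pixel frames at 848x480 -> 14 latent frames of 60x106 latent pixels
    print(hunyuan_latent_shape(1, 53, 480, 848))  # [1, 16, 14, 60, 106]
```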


@@ -1,5 +1,3 @@
from __future__ import annotations
import nodes
import folder_paths
from comfy.cli_args import args
@@ -11,8 +9,6 @@ import numpy as np
import json
import os
from comfy.comfy_types import FileLocator
MAX_RESOLUTION = nodes.MAX_RESOLUTION
class ImageCrop:
@@ -103,7 +99,7 @@ class SaveAnimatedWEBP:
method = self.methods.get(method)
filename_prefix += self.prefix_append
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0])
results: list[FileLocator] = []
results = list()
pil_images = []
for image in images:
i = 255. * image.cpu().numpy()


@@ -1,14 +1,9 @@
import io
import nodes
import node_helpers
import torch
import comfy.model_management
import comfy.model_sampling
import comfy.utils
import math
import numpy as np
import av
from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
class EmptyLTXVLatentVideo:
@classmethod
@@ -38,6 +33,7 @@ class LTXVImgToVideo:
"height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
"length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
"image_noise_scale": ("FLOAT", {"default": 0.15, "min": 0, "max": 1.0, "step": 0.01, "tooltip": "Amount of noise to apply on conditioning image latent."})
}}
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
@@ -46,217 +42,16 @@ class LTXVImgToVideo:
CATEGORY = "conditioning/video_models"
FUNCTION = "generate"
def generate(self, positive, negative, image, vae, width, height, length, batch_size):
def generate(self, positive, negative, image, vae, width, height, length, batch_size, image_noise_scale):
pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
encode_pixels = pixels[:, :, :, :3]
t = vae.encode(encode_pixels)
positive = node_helpers.conditioning_set_values(positive, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
negative = node_helpers.conditioning_set_values(negative, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
latent[:, :, :t.shape[2]] = t
conditioning_latent_frames_mask = torch.ones(
(batch_size, 1, latent.shape[2], 1, 1),
dtype=torch.float32,
device=latent.device,
)
conditioning_latent_frames_mask[:, :, :t.shape[2]] = 0
return (positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}, )
def conditioning_get_any_value(conditioning, key, default=None):
for t in conditioning:
if key in t[1]:
return t[1][key]
return default
def get_noise_mask(latent):
noise_mask = latent.get("noise_mask", None)
latent_image = latent["samples"]
if noise_mask is None:
batch_size, _, latent_length, _, _ = latent_image.shape
noise_mask = torch.ones(
(batch_size, 1, latent_length, 1, 1),
dtype=torch.float32,
device=latent_image.device,
)
else:
noise_mask = noise_mask.clone()
return noise_mask
def get_keyframe_idxs(cond):
keyframe_idxs = conditioning_get_any_value(cond, "keyframe_idxs", None)
if keyframe_idxs is None:
return None, 0
num_keyframes = torch.unique(keyframe_idxs[:, 0]).shape[0]
return keyframe_idxs, num_keyframes
class LTXVAddGuide:
@classmethod
def INPUT_TYPES(s):
return {"required": {"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"vae": ("VAE",),
"latent": ("LATENT",),
"image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames." \
"If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}),
"frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999,
"tooltip": "Frame index to start the conditioning at. Must be divisible by 8. " \
"If a frame is not divisible by 8, it will be rounded down to the nearest multiple of 8. " \
"Negative values are counted from the end of the video."}),
"strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
}
}
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "negative", "latent")
CATEGORY = "conditioning/video_models"
FUNCTION = "generate"
def __init__(self):
self._num_prefix_frames = 2
self._patchifier = SymmetricPatchifier(1)
def encode(self, vae, latent_width, latent_height, images, scale_factors):
time_scale_factor, width_scale_factor, height_scale_factor = scale_factors
images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1]
pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1)
encode_pixels = pixels[:, :, :, :3]
t = vae.encode(encode_pixels)
return encode_pixels, t
def get_latent_index(self, cond, latent_length, frame_idx, scale_factors):
time_scale_factor, _, _ = scale_factors
_, num_keyframes = get_keyframe_idxs(cond)
latent_count = latent_length - num_keyframes
frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * 8 + 1 + frame_idx, 0)
frame_idx = frame_idx // time_scale_factor * time_scale_factor # frame index must be divisible by 8
latent_idx = (frame_idx + time_scale_factor - 1) // time_scale_factor
return frame_idx, latent_idx
def add_keyframe_index(self, cond, frame_idx, guiding_latent, scale_factors):
keyframe_idxs, _ = get_keyframe_idxs(cond)
_, latent_coords = self._patchifier.patchify(guiding_latent)
pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, True)
pixel_coords[:, 0] += frame_idx
if keyframe_idxs is None:
keyframe_idxs = pixel_coords
else:
keyframe_idxs = torch.cat([keyframe_idxs, pixel_coords], dim=2)
return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs})
def append_keyframe(self, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors):
positive = self.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors)
negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors)
mask = torch.full(
(noise_mask.shape[0], 1, guiding_latent.shape[2], 1, 1),
1.0 - strength,
dtype=noise_mask.dtype,
device=noise_mask.device,
)
latent_image = torch.cat([latent_image, guiding_latent], dim=2)
noise_mask = torch.cat([noise_mask, mask], dim=2)
return positive, negative, latent_image, noise_mask
def replace_latent_frames(self, latent_image, noise_mask, guiding_latent, latent_idx, strength):
cond_length = guiding_latent.shape[2]
assert latent_image.shape[2] >= latent_idx + cond_length, "Conditioning frames exceed the length of the latent sequence."
mask = torch.full(
(noise_mask.shape[0], 1, cond_length, 1, 1),
1.0 - strength,
dtype=noise_mask.dtype,
device=noise_mask.device,
)
latent_image = latent_image.clone()
noise_mask = noise_mask.clone()
latent_image[:, :, latent_idx : latent_idx + cond_length] = guiding_latent
noise_mask[:, :, latent_idx : latent_idx + cond_length] = mask
return latent_image, noise_mask
def generate(self, positive, negative, vae, latent, image, frame_idx, strength):
scale_factors = vae.downscale_index_formula
latent_image = latent["samples"]
noise_mask = get_noise_mask(latent)
_, _, latent_length, latent_height, latent_width = latent_image.shape
image, t = self.encode(vae, latent_width, latent_height, image, scale_factors)
frame_idx, latent_idx = self.get_latent_index(positive, latent_length, frame_idx, scale_factors)
assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
num_prefix_frames = min(self._num_prefix_frames, t.shape[2])
positive, negative, latent_image, noise_mask = self.append_keyframe(
positive,
negative,
frame_idx,
latent_image,
noise_mask,
t[:, :, :num_prefix_frames],
strength,
scale_factors,
)
latent_idx += num_prefix_frames
t = t[:, :, num_prefix_frames:]
if t.shape[2] == 0:
return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
latent_image, noise_mask = self.replace_latent_frames(
latent_image,
noise_mask,
t,
latent_idx,
strength,
)
return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
class LTXVCropGuides:
@classmethod
def INPUT_TYPES(s):
return {"required": {"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"latent": ("LATENT",),
}
}
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "negative", "latent")
CATEGORY = "conditioning/video_models"
FUNCTION = "crop"
def __init__(self):
self._patchifier = SymmetricPatchifier(1)
def crop(self, positive, negative, latent):
latent_image = latent["samples"].clone()
noise_mask = get_noise_mask(latent)
_, num_keyframes = get_keyframe_idxs(positive)
if num_keyframes == 0:
return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
latent_image = latent_image[:, :, :-num_keyframes]
noise_mask = noise_mask[:, :, :-num_keyframes]
positive = node_helpers.conditioning_set_values(positive, {"keyframe_idxs": None})
negative = node_helpers.conditioning_set_values(negative, {"keyframe_idxs": None})
return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
return (positive, negative, {"samples": latent}, )
class LTXVConditioning:
@@ -379,77 +174,6 @@ class LTXVScheduler:
return (sigmas,)
def encode_single_frame(output_file, image_array: np.ndarray, crf):
container = av.open(output_file, "w", format="mp4")
try:
stream = container.add_stream(
"h264", rate=1, options={"crf": str(crf), "preset": "veryfast"}
)
stream.height = image_array.shape[0]
stream.width = image_array.shape[1]
av_frame = av.VideoFrame.from_ndarray(image_array, format="rgb24").reformat(
format="yuv420p"
)
container.mux(stream.encode(av_frame))
container.mux(stream.encode())
finally:
container.close()
def decode_single_frame(video_file):
container = av.open(video_file)
try:
stream = next(s for s in container.streams if s.type == "video")
frame = next(container.decode(stream))
finally:
container.close()
return frame.to_ndarray(format="rgb24")
def preprocess(image: torch.Tensor, crf=29):
if crf == 0:
return image
image_array = (image[:(image.shape[0] // 2) * 2, :(image.shape[1] // 2) * 2] * 255.0).byte().cpu().numpy()
with io.BytesIO() as output_file:
encode_single_frame(output_file, image_array, crf)
video_bytes = output_file.getvalue()
with io.BytesIO(video_bytes) as video_file:
image_array = decode_single_frame(video_file)
tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0
return tensor
class LTXVPreprocess:
@classmethod
def INPUT_TYPES(s):
return {
"required": {
"image": ("IMAGE",),
"img_compression": (
"INT",
{
"default": 35,
"min": 0,
"max": 100,
"tooltip": "Amount of compression to apply on image.",
},
),
}
}
FUNCTION = "preprocess"
RETURN_TYPES = ("IMAGE",)
RETURN_NAMES = ("output_image",)
CATEGORY = "image"
def preprocess(self, image, img_compression):
if img_compression > 0:
output_images = []
for i in range(image.shape[0]):
output_images.append(preprocess(image[i], img_compression))
return (torch.stack(output_images),)
NODE_CLASS_MAPPINGS = {
"EmptyLTXVLatentVideo": EmptyLTXVLatentVideo,
@@ -457,7 +181,4 @@ NODE_CLASS_MAPPINGS = {
"ModelSamplingLTXV": ModelSamplingLTXV,
"LTXVConditioning": LTXVConditioning,
"LTXVScheduler": LTXVScheduler,
"LTXVAddGuide": LTXVAddGuide,
"LTXVPreprocess": LTXVPreprocess,
"LTXVCropGuides": LTXVCropGuides,
}
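
`LTXVAddGuide.get_latent_index` above rounds the requested pixel-frame index down to a multiple of the temporal scale factor and converts it to a latent index, with negative indices counted from the end of the video. A small worked example of that arithmetic, assuming LTXV's 8x temporal downscale factor:

```
# Worked example of the frame-index arithmetic from LTXVAddGuide.get_latent_index
# above, assuming LTXV's 8x temporal downscale factor.
def to_latent_index(frame_idx, latent_count, time_scale_factor=8):
    # Negative indices count back from the end of the video, as in the node above.
    if frame_idx < 0:
        frame_idx = max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0)
    frame_idx = frame_idx // time_scale_factor * time_scale_factor  # round down to a multiple of 8
    latent_idx = (frame_idx + time_scale_factor - 1) // time_scale_factor
    return frame_idx, latent_idx


if __name__ == "__main__":
    print(to_latent_index(0, 13))   # (0, 0)
    print(to_latent_index(21, 13))  # 21 rounds down to 16 -> (16, 2)
    print(to_latent_index(-1, 13))  # last frame: 96 -> (96, 12)
```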


@@ -1,12 +1,9 @@
from __future__ import annotations
import os
import av
import torch
import folder_paths
import json
from fractions import Fraction
from comfy.comfy_types import FileLocator
class SaveWEBM:
@@ -65,7 +62,7 @@ class SaveWEBM:
container.mux(stream.encode())
container.close()
results: list[FileLocator] = [{
results = [{
"filename": file,
"subfolder": subfolder,
"type": self.type


@@ -4,7 +4,6 @@ import comfy.utils
import comfy.sd
import folder_paths
import comfy_extras.nodes_model_merging
import node_helpers
class ImageOnlyCheckpointLoader:
@@ -122,38 +121,12 @@ class ImageOnlyCheckpointSave(comfy_extras.nodes_model_merging.CheckpointSave):
comfy_extras.nodes_model_merging.save_checkpoint(model, clip_vision=clip_vision, vae=vae, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
return {}
class ConditioningSetAreaPercentageVideo:
@classmethod
def INPUT_TYPES(s):
return {"required": {"conditioning": ("CONDITIONING", ),
"width": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
"height": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
"temporal": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
"x": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
"y": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
"z": ("FLOAT", {"default": 0, "min": 0, "max": 1.0, "step": 0.01}),
"strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
CATEGORY = "conditioning"
def append(self, conditioning, width, height, temporal, x, y, z, strength):
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
"strength": strength,
"set_area_to_bounds": False})
return (c, )
NODE_CLASS_MAPPINGS = {
"ImageOnlyCheckpointLoader": ImageOnlyCheckpointLoader,
"SVD_img2vid_Conditioning": SVD_img2vid_Conditioning,
"VideoLinearCFGGuidance": VideoLinearCFGGuidance,
"VideoTriangleCFGGuidance": VideoTriangleCFGGuidance,
"ImageOnlyCheckpointSave": ImageOnlyCheckpointSave,
"ConditioningSetAreaPercentageVideo": ConditioningSetAreaPercentageVideo,
}
NODE_DISPLAY_NAME_MAPPINGS = {


@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.3.24"
__version__ = "0.3.18"


@@ -25,7 +25,7 @@ import comfy.sample
import comfy.sd
import comfy.utils
import comfy.controlnet
from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator
from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
import comfy.clip_vision
@@ -479,7 +479,7 @@ class SaveLatent:
file = f"{filename}_{counter:05}_.latent"
results: list[FileLocator] = []
results = list()
results.append({
"filename": file,
"subfolder": subfolder,
@@ -1519,7 +1519,7 @@ class KSampler:
return {
"required": {
"model": ("MODEL", {"tooltip": "The model used for denoising the input latent."}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True, "tooltip": "The random seed used for creating the noise."}),
"seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "tooltip": "The random seed used for creating the noise."}),
"steps": ("INT", {"default": 20, "min": 1, "max": 10000, "tooltip": "The number of steps used in the denoising process."}),
"cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01, "tooltip": "The Classifier-Free Guidance scale balances creativity and adherence to the prompt. Higher values result in images more closely matching the prompt however too high values will negatively impact quality."}),
"sampler_name": (comfy.samplers.KSampler.SAMPLERS, {"tooltip": "The algorithm used when sampling, this can affect the quality, speed, and style of the generated output."}),
@@ -1547,7 +1547,7 @@ class KSamplerAdvanced:
return {"required":
{"model": ("MODEL",),
"add_noise": (["enable", "disable"], ),
"noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}),
"noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
"cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
"sampler_name": (comfy.samplers.KSampler.SAMPLERS, ),


@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.3.24"
version = "0.3.18"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.9"