Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
158419f3a0 | ||
|
|
640c47e7de | ||
|
|
31e9e36c94 | ||
|
|
577de83ca9 | ||
|
|
3535909eb8 | ||
|
|
235d3901fc | ||
|
|
d42613686f | ||
|
|
1b3bf0a5da | ||
|
|
ae60b150e5 | ||
|
|
42da274717 | ||
|
|
28f178a840 | ||
|
|
8ab15c863c | ||
|
|
924d771e18 | ||
|
|
02a1b01aad | ||
|
|
a692c3cca4 | ||
|
|
5d3cc85e13 | ||
|
|
c7c025b8d1 | ||
|
|
fd08e39588 | ||
|
|
56b6ee6754 | ||
|
|
cc33cd3422 | ||
|
|
b9980592c4 | ||
|
|
16417b40d9 | ||
|
|
271c9c5b9e | ||
|
|
a4e679765e |
@@ -69,9 +69,11 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
|
||||
- [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
|
||||
- [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
|
||||
- [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
|
||||
- Audio Models
|
||||
- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
|
||||
- [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
|
||||
- 3D Models
|
||||
- [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
|
||||
- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
|
||||
- Asynchronous Queue system
|
||||
- Many optimizations: Only re-executes the parts of the workflow that changes between executions.
|
||||
- Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
|
||||
|
||||
@@ -142,6 +142,8 @@ class PerformanceFeature(enum.Enum):
|
||||
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
|
||||
|
||||
parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
|
||||
|
||||
parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
|
||||
parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
|
||||
parser.add_argument("--windows-standalone-build", action="store_true", help="Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup).")
|
||||
|
||||
@@ -1277,6 +1277,7 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
phi1_fn = lambda t: torch.expm1(t) / t
|
||||
phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t
|
||||
|
||||
old_sigma_down = None
|
||||
old_denoised = None
|
||||
uncond_denoised = None
|
||||
def post_cfg_function(args):
|
||||
@@ -1304,9 +1305,9 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
x = x + d * dt
|
||||
else:
|
||||
# Second order multistep method in https://arxiv.org/pdf/2308.02157
|
||||
t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigma_down), t_fn(sigmas[i - 1])
|
||||
t, t_old, t_next, t_prev = t_fn(sigmas[i]), t_fn(old_sigma_down), t_fn(sigma_down), t_fn(sigmas[i - 1])
|
||||
h = t_next - t
|
||||
c2 = (t_prev - t) / h
|
||||
c2 = (t_prev - t_old) / h
|
||||
|
||||
phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
|
||||
b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
|
||||
@@ -1326,6 +1327,7 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
old_denoised = uncond_denoised
|
||||
else:
|
||||
old_denoised = denoised
|
||||
old_sigma_down = sigma_down
|
||||
return x
|
||||
|
||||
@torch.no_grad()
|
||||
|
||||
@@ -466,3 +466,7 @@ class Hunyuan3Dv2mini(LatentFormat):
|
||||
latent_channels = 64
|
||||
latent_dimensions = 1
|
||||
scale_factor = 1.0188137142395404
|
||||
|
||||
class ACEAudio(LatentFormat):
|
||||
latent_channels = 8
|
||||
latent_dimensions = 2
|
||||
|
||||
761
comfy/ldm/ace/attention.py
Normal file
761
comfy/ldm/ace/attention.py
Normal file
@@ -0,0 +1,761 @@
|
||||
# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/attention.py
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Tuple, Union, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
query_dim: int,
|
||||
cross_attention_dim: Optional[int] = None,
|
||||
heads: int = 8,
|
||||
kv_heads: Optional[int] = None,
|
||||
dim_head: int = 64,
|
||||
dropout: float = 0.0,
|
||||
bias: bool = False,
|
||||
qk_norm: Optional[str] = None,
|
||||
added_kv_proj_dim: Optional[int] = None,
|
||||
added_proj_bias: Optional[bool] = True,
|
||||
out_bias: bool = True,
|
||||
scale_qk: bool = True,
|
||||
only_cross_attention: bool = False,
|
||||
eps: float = 1e-5,
|
||||
rescale_output_factor: float = 1.0,
|
||||
residual_connection: bool = False,
|
||||
processor=None,
|
||||
out_dim: int = None,
|
||||
out_context_dim: int = None,
|
||||
context_pre_only=None,
|
||||
pre_only=False,
|
||||
elementwise_affine: bool = True,
|
||||
is_causal: bool = False,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
|
||||
self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
|
||||
self.query_dim = query_dim
|
||||
self.use_bias = bias
|
||||
self.is_cross_attention = cross_attention_dim is not None
|
||||
self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
|
||||
self.rescale_output_factor = rescale_output_factor
|
||||
self.residual_connection = residual_connection
|
||||
self.dropout = dropout
|
||||
self.fused_projections = False
|
||||
self.out_dim = out_dim if out_dim is not None else query_dim
|
||||
self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim
|
||||
self.context_pre_only = context_pre_only
|
||||
self.pre_only = pre_only
|
||||
self.is_causal = is_causal
|
||||
|
||||
self.scale_qk = scale_qk
|
||||
self.scale = dim_head**-0.5 if self.scale_qk else 1.0
|
||||
|
||||
self.heads = out_dim // dim_head if out_dim is not None else heads
|
||||
# for slice_size > 0 the attention score computation
|
||||
# is split across the batch axis to save memory
|
||||
# You can set slice_size with `set_attention_slice`
|
||||
self.sliceable_head_dim = heads
|
||||
|
||||
self.added_kv_proj_dim = added_kv_proj_dim
|
||||
self.only_cross_attention = only_cross_attention
|
||||
|
||||
if self.added_kv_proj_dim is None and self.only_cross_attention:
|
||||
raise ValueError(
|
||||
"`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
|
||||
)
|
||||
|
||||
self.group_norm = None
|
||||
self.spatial_norm = None
|
||||
|
||||
self.norm_q = None
|
||||
self.norm_k = None
|
||||
|
||||
self.norm_cross = None
|
||||
self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device)
|
||||
|
||||
if not self.only_cross_attention:
|
||||
# only relevant for the `AddedKVProcessor` classes
|
||||
self.to_k = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
|
||||
self.to_v = operations.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device)
|
||||
else:
|
||||
self.to_k = None
|
||||
self.to_v = None
|
||||
|
||||
self.added_proj_bias = added_proj_bias
|
||||
if self.added_kv_proj_dim is not None:
|
||||
self.add_k_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
|
||||
self.add_v_proj = operations.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias, dtype=dtype, device=device)
|
||||
if self.context_pre_only is not None:
|
||||
self.add_q_proj = operations.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias, dtype=dtype, device=device)
|
||||
else:
|
||||
self.add_q_proj = None
|
||||
self.add_k_proj = None
|
||||
self.add_v_proj = None
|
||||
|
||||
if not self.pre_only:
|
||||
self.to_out = nn.ModuleList([])
|
||||
self.to_out.append(operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device))
|
||||
self.to_out.append(nn.Dropout(dropout))
|
||||
else:
|
||||
self.to_out = None
|
||||
|
||||
if self.context_pre_only is not None and not self.context_pre_only:
|
||||
self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
|
||||
else:
|
||||
self.to_add_out = None
|
||||
|
||||
self.norm_added_q = None
|
||||
self.norm_added_k = None
|
||||
self.processor = processor
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
**cross_attention_kwargs,
|
||||
) -> torch.Tensor:
|
||||
return self.processor(
|
||||
self,
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
**cross_attention_kwargs,
|
||||
)
|
||||
|
||||
|
||||
class CustomLiteLAProcessor2_0:
|
||||
"""Attention processor used typically in processing the SD3-like self-attention projections. add rms norm for query and key and apply RoPE"""
|
||||
|
||||
def __init__(self):
|
||||
self.kernel_func = nn.ReLU(inplace=False)
|
||||
self.eps = 1e-15
|
||||
self.pad_val = 1.0
|
||||
|
||||
def apply_rotary_emb(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
|
||||
to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
|
||||
reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
|
||||
tensors contain rotary embeddings and are returned as real tensors.
|
||||
|
||||
Args:
|
||||
x (`torch.Tensor`):
|
||||
Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
|
||||
freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
|
||||
"""
|
||||
cos, sin = freqs_cis # [S, D]
|
||||
cos = cos[None, None]
|
||||
sin = sin[None, None]
|
||||
cos, sin = cos.to(x.device), sin.to(x.device)
|
||||
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
|
||||
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
||||
out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
|
||||
|
||||
return out
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
||||
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.FloatTensor:
|
||||
hidden_states_len = hidden_states.shape[1]
|
||||
|
||||
input_ndim = hidden_states.ndim
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
if encoder_hidden_states is not None:
|
||||
context_input_ndim = encoder_hidden_states.ndim
|
||||
if context_input_ndim == 4:
|
||||
batch_size, channel, height, width = encoder_hidden_states.shape
|
||||
encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size = hidden_states.shape[0]
|
||||
|
||||
# `sample` projections.
|
||||
dtype = hidden_states.dtype
|
||||
query = attn.to_q(hidden_states)
|
||||
key = attn.to_k(hidden_states)
|
||||
value = attn.to_v(hidden_states)
|
||||
|
||||
# `context` projections.
|
||||
has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
|
||||
if encoder_hidden_states is not None and has_encoder_hidden_state_proj:
|
||||
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
||||
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
||||
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
||||
|
||||
# attention
|
||||
if not attn.is_cross_attention:
|
||||
query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
|
||||
key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
|
||||
value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
|
||||
else:
|
||||
query = hidden_states
|
||||
key = encoder_hidden_states
|
||||
value = encoder_hidden_states
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
|
||||
query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
|
||||
key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2)
|
||||
value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1)
|
||||
|
||||
# RoPE需要 [B, H, S, D] 输入
|
||||
# 此时 query是 [B, H, D, S], 需要转成 [B, H, S, D] 才能应用RoPE
|
||||
query = query.permute(0, 1, 3, 2) # [B, H, S, D] (从 [B, H, D, S])
|
||||
|
||||
# Apply query and key normalization if needed
|
||||
if attn.norm_q is not None:
|
||||
query = attn.norm_q(query)
|
||||
if attn.norm_k is not None:
|
||||
key = attn.norm_k(key)
|
||||
|
||||
# Apply RoPE if needed
|
||||
if rotary_freqs_cis is not None:
|
||||
query = self.apply_rotary_emb(query, rotary_freqs_cis)
|
||||
if not attn.is_cross_attention:
|
||||
key = self.apply_rotary_emb(key, rotary_freqs_cis)
|
||||
elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
|
||||
key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
|
||||
|
||||
# 此时 query是 [B, H, S, D],需要还原成 [B, H, D, S]
|
||||
query = query.permute(0, 1, 3, 2) # [B, H, D, S]
|
||||
|
||||
if attention_mask is not None:
|
||||
# attention_mask: [B, S] -> [B, 1, S, 1]
|
||||
attention_mask = attention_mask[:, None, :, None].to(key.dtype) # [B, 1, S, 1]
|
||||
query = query * attention_mask.permute(0, 1, 3, 2) # [B, H, S, D] * [B, 1, S, 1]
|
||||
if not attn.is_cross_attention:
|
||||
key = key * attention_mask # key: [B, h, S, D] 与 mask [B, 1, S, 1] 相乘
|
||||
value = value * attention_mask.permute(0, 1, 3, 2) # 如果 value 是 [B, h, D, S],那么需调整mask以匹配S维度
|
||||
|
||||
if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
|
||||
encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(key.dtype) # [B, 1, S_enc, 1]
|
||||
# 此时 key: [B, h, S_enc, D], value: [B, h, D, S_enc]
|
||||
key = key * encoder_attention_mask # [B, h, S_enc, D] * [B, 1, S_enc, 1]
|
||||
value = value * encoder_attention_mask.permute(0, 1, 3, 2) # [B, h, D, S_enc] * [B, 1, 1, S_enc]
|
||||
|
||||
query = self.kernel_func(query)
|
||||
key = self.kernel_func(key)
|
||||
|
||||
query, key, value = query.float(), key.float(), value.float()
|
||||
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val)
|
||||
|
||||
vk = torch.matmul(value, key)
|
||||
|
||||
hidden_states = torch.matmul(vk, query)
|
||||
|
||||
if hidden_states.dtype in [torch.float16, torch.bfloat16]:
|
||||
hidden_states = hidden_states.float()
|
||||
|
||||
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
|
||||
|
||||
hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1)
|
||||
|
||||
hidden_states = hidden_states.to(dtype)
|
||||
if encoder_hidden_states is not None:
|
||||
encoder_hidden_states = encoder_hidden_states.to(dtype)
|
||||
|
||||
# Split the attention outputs.
|
||||
if encoder_hidden_states is not None and not attn.is_cross_attention and has_encoder_hidden_state_proj:
|
||||
hidden_states, encoder_hidden_states = (
|
||||
hidden_states[:, : hidden_states_len],
|
||||
hidden_states[:, hidden_states_len:],
|
||||
)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
if encoder_hidden_states is not None and not attn.context_pre_only and not attn.is_cross_attention and hasattr(attn, "to_add_out"):
|
||||
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
if encoder_hidden_states is not None and context_input_ndim == 4:
|
||||
encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if torch.get_autocast_gpu_dtype() == torch.float16:
|
||||
hidden_states = hidden_states.clip(-65504, 65504)
|
||||
if encoder_hidden_states is not None:
|
||||
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
|
||||
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class CustomerAttnProcessor2_0:
|
||||
r"""
|
||||
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
|
||||
"""
|
||||
|
||||
def apply_rotary_emb(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
|
||||
to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
|
||||
reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
|
||||
tensors contain rotary embeddings and are returned as real tensors.
|
||||
|
||||
Args:
|
||||
x (`torch.Tensor`):
|
||||
Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
|
||||
freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
|
||||
"""
|
||||
cos, sin = freqs_cis # [S, D]
|
||||
cos = cos[None, None]
|
||||
sin = sin[None, None]
|
||||
cos, sin = cos.to(x.device), sin.to(x.device)
|
||||
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
|
||||
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
||||
out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
|
||||
|
||||
return out
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn: Attention,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
||||
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
|
||||
residual = hidden_states
|
||||
input_ndim = hidden_states.ndim
|
||||
|
||||
if input_ndim == 4:
|
||||
batch_size, channel, height, width = hidden_states.shape
|
||||
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
||||
|
||||
batch_size, sequence_length, _ = (
|
||||
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
||||
)
|
||||
|
||||
has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
|
||||
|
||||
if attn.group_norm is not None:
|
||||
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
||||
|
||||
query = attn.to_q(hidden_states)
|
||||
|
||||
if encoder_hidden_states is None:
|
||||
encoder_hidden_states = hidden_states
|
||||
elif attn.norm_cross:
|
||||
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
||||
|
||||
key = attn.to_k(encoder_hidden_states)
|
||||
value = attn.to_v(encoder_hidden_states)
|
||||
|
||||
inner_dim = key.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
|
||||
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
||||
|
||||
if attn.norm_q is not None:
|
||||
query = attn.norm_q(query)
|
||||
if attn.norm_k is not None:
|
||||
key = attn.norm_k(key)
|
||||
|
||||
# Apply RoPE if needed
|
||||
if rotary_freqs_cis is not None:
|
||||
query = self.apply_rotary_emb(query, rotary_freqs_cis)
|
||||
if not attn.is_cross_attention:
|
||||
key = self.apply_rotary_emb(key, rotary_freqs_cis)
|
||||
elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
|
||||
key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)
|
||||
|
||||
if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
|
||||
# attention_mask: N x S1
|
||||
# encoder_attention_mask: N x S2
|
||||
# cross attention 整合attention_mask和encoder_attention_mask
|
||||
combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
|
||||
attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
|
||||
attention_mask = attention_mask[:, None, :, :].expand(-1, attn.heads, -1, -1).to(query.dtype)
|
||||
|
||||
elif not attn.is_cross_attention and attention_mask is not None:
|
||||
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
||||
# scaled_dot_product_attention expects attention_mask shape to be
|
||||
# (batch, heads, source_length, target_length)
|
||||
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
|
||||
|
||||
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
||||
hidden_states = optimized_attention(
|
||||
query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
|
||||
).to(query.dtype)
|
||||
|
||||
# linear proj
|
||||
hidden_states = attn.to_out[0](hidden_states)
|
||||
# dropout
|
||||
hidden_states = attn.to_out[1](hidden_states)
|
||||
|
||||
if input_ndim == 4:
|
||||
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
||||
|
||||
if attn.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
hidden_states = hidden_states / attn.rescale_output_factor
|
||||
|
||||
return hidden_states
|
||||
|
||||
def val2list(x: list or tuple or any, repeat_time=1) -> list: # type: ignore
|
||||
"""Repeat `val` for `repeat_time` times and return the list or val if list/tuple."""
|
||||
if isinstance(x, (list, tuple)):
|
||||
return list(x)
|
||||
return [x for _ in range(repeat_time)]
|
||||
|
||||
|
||||
def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: # type: ignore
|
||||
"""Return tuple with min_len by repeating element at idx_repeat."""
|
||||
# convert to list first
|
||||
x = val2list(x)
|
||||
|
||||
# repeat elements if necessary
|
||||
if len(x) > 0:
|
||||
x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))]
|
||||
|
||||
return tuple(x)
|
||||
|
||||
|
||||
def t2i_modulate(x, shift, scale):
|
||||
return x * (1 + scale) + shift
|
||||
|
||||
|
||||
def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
|
||||
if isinstance(kernel_size, tuple):
|
||||
return tuple([get_same_padding(ks) for ks in kernel_size])
|
||||
else:
|
||||
assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be odd number"
|
||||
return kernel_size // 2
|
||||
|
||||
class ConvLayer(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_dim: int,
|
||||
out_dim: int,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
padding: Union[int, None] = None,
|
||||
use_bias=False,
|
||||
norm=None,
|
||||
act=None,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
if padding is None:
|
||||
padding = get_same_padding(kernel_size)
|
||||
padding *= dilation
|
||||
|
||||
self.in_dim = in_dim
|
||||
self.out_dim = out_dim
|
||||
self.kernel_size = kernel_size
|
||||
self.stride = stride
|
||||
self.dilation = dilation
|
||||
self.groups = groups
|
||||
self.padding = padding
|
||||
self.use_bias = use_bias
|
||||
|
||||
self.conv = operations.Conv1d(
|
||||
in_dim,
|
||||
out_dim,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias=use_bias,
|
||||
device=device,
|
||||
dtype=dtype
|
||||
)
|
||||
if norm is not None:
|
||||
self.norm = operations.RMSNorm(out_dim, elementwise_affine=False, dtype=dtype, device=device)
|
||||
else:
|
||||
self.norm = None
|
||||
if act is not None:
|
||||
self.act = nn.SiLU(inplace=True)
|
||||
else:
|
||||
self.act = None
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = self.conv(x)
|
||||
if self.norm:
|
||||
x = self.norm(x)
|
||||
if self.act:
|
||||
x = self.act(x)
|
||||
return x
|
||||
|
||||
|
||||
class GLUMBConv(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_features: int,
|
||||
hidden_features: int,
|
||||
out_feature=None,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding: Union[int, None] = None,
|
||||
use_bias=False,
|
||||
norm=(None, None, None),
|
||||
act=("silu", "silu", None),
|
||||
dilation=1,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
out_feature = out_feature or in_features
|
||||
super().__init__()
|
||||
use_bias = val2tuple(use_bias, 3)
|
||||
norm = val2tuple(norm, 3)
|
||||
act = val2tuple(act, 3)
|
||||
|
||||
self.glu_act = nn.SiLU(inplace=False)
|
||||
self.inverted_conv = ConvLayer(
|
||||
in_features,
|
||||
hidden_features * 2,
|
||||
1,
|
||||
use_bias=use_bias[0],
|
||||
norm=norm[0],
|
||||
act=act[0],
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
self.depth_conv = ConvLayer(
|
||||
hidden_features * 2,
|
||||
hidden_features * 2,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
groups=hidden_features * 2,
|
||||
padding=padding,
|
||||
use_bias=use_bias[1],
|
||||
norm=norm[1],
|
||||
act=None,
|
||||
dilation=dilation,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
self.point_conv = ConvLayer(
|
||||
hidden_features,
|
||||
out_feature,
|
||||
1,
|
||||
use_bias=use_bias[2],
|
||||
norm=norm[2],
|
||||
act=act[2],
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = x.transpose(1, 2)
|
||||
x = self.inverted_conv(x)
|
||||
x = self.depth_conv(x)
|
||||
|
||||
x, gate = torch.chunk(x, 2, dim=1)
|
||||
gate = self.glu_act(gate)
|
||||
x = x * gate
|
||||
|
||||
x = self.point_conv(x)
|
||||
x = x.transpose(1, 2)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class LinearTransformerBlock(nn.Module):
|
||||
"""
|
||||
A Sana block with global shared adaptive layer norm (adaLN-single) conditioning.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
dim,
|
||||
num_attention_heads,
|
||||
attention_head_dim,
|
||||
use_adaln_single=True,
|
||||
cross_attention_dim=None,
|
||||
added_kv_proj_dim=None,
|
||||
context_pre_only=False,
|
||||
mlp_ratio=4.0,
|
||||
add_cross_attention=False,
|
||||
add_cross_attention_dim=None,
|
||||
qk_norm=None,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.norm1 = operations.RMSNorm(dim, elementwise_affine=False, eps=1e-6)
|
||||
self.attn = Attention(
|
||||
query_dim=dim,
|
||||
cross_attention_dim=cross_attention_dim,
|
||||
added_kv_proj_dim=added_kv_proj_dim,
|
||||
dim_head=attention_head_dim,
|
||||
heads=num_attention_heads,
|
||||
out_dim=dim,
|
||||
bias=True,
|
||||
qk_norm=qk_norm,
|
||||
processor=CustomLiteLAProcessor2_0(),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
self.add_cross_attention = add_cross_attention
|
||||
self.context_pre_only = context_pre_only
|
||||
|
||||
if add_cross_attention and add_cross_attention_dim is not None:
|
||||
self.cross_attn = Attention(
|
||||
query_dim=dim,
|
||||
cross_attention_dim=add_cross_attention_dim,
|
||||
added_kv_proj_dim=add_cross_attention_dim,
|
||||
dim_head=attention_head_dim,
|
||||
heads=num_attention_heads,
|
||||
out_dim=dim,
|
||||
context_pre_only=context_pre_only,
|
||||
bias=True,
|
||||
qk_norm=qk_norm,
|
||||
processor=CustomerAttnProcessor2_0(),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
self.norm2 = operations.RMSNorm(dim, 1e-06, elementwise_affine=False)
|
||||
|
||||
self.ff = GLUMBConv(
|
||||
in_features=dim,
|
||||
hidden_features=int(dim * mlp_ratio),
|
||||
use_bias=(True, True, False),
|
||||
norm=(None, None, None),
|
||||
act=("silu", "silu", None),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
self.use_adaln_single = use_adaln_single
|
||||
if use_adaln_single:
|
||||
self.scale_shift_table = nn.Parameter(torch.empty(6, dim, dtype=dtype, device=device))
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
encoder_hidden_states: torch.FloatTensor = None,
|
||||
attention_mask: torch.FloatTensor = None,
|
||||
encoder_attention_mask: torch.FloatTensor = None,
|
||||
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
|
||||
temb: torch.FloatTensor = None,
|
||||
):
|
||||
|
||||
N = hidden_states.shape[0]
|
||||
|
||||
# step 1: AdaLN single
|
||||
if self.use_adaln_single:
|
||||
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
|
||||
comfy.model_management.cast_to(self.scale_shift_table[None], dtype=temb.dtype, device=temb.device) + temb.reshape(N, 6, -1)
|
||||
).chunk(6, dim=1)
|
||||
|
||||
norm_hidden_states = self.norm1(hidden_states)
|
||||
if self.use_adaln_single:
|
||||
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
|
||||
|
||||
# step 2: attention
|
||||
if not self.add_cross_attention:
|
||||
attn_output, encoder_hidden_states = self.attn(
|
||||
hidden_states=norm_hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
rotary_freqs_cis=rotary_freqs_cis,
|
||||
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
|
||||
)
|
||||
else:
|
||||
attn_output, _ = self.attn(
|
||||
hidden_states=norm_hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
rotary_freqs_cis=rotary_freqs_cis,
|
||||
rotary_freqs_cis_cross=None,
|
||||
)
|
||||
|
||||
if self.use_adaln_single:
|
||||
attn_output = gate_msa * attn_output
|
||||
hidden_states = attn_output + hidden_states
|
||||
|
||||
if self.add_cross_attention:
|
||||
attn_output = self.cross_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
rotary_freqs_cis=rotary_freqs_cis,
|
||||
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
|
||||
)
|
||||
hidden_states = attn_output + hidden_states
|
||||
|
||||
# step 3: add norm
|
||||
norm_hidden_states = self.norm2(hidden_states)
|
||||
if self.use_adaln_single:
|
||||
norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
|
||||
|
||||
# step 4: feed forward
|
||||
ff_output = self.ff(norm_hidden_states)
|
||||
if self.use_adaln_single:
|
||||
ff_output = gate_mlp * ff_output
|
||||
|
||||
hidden_states = hidden_states + ff_output
|
||||
|
||||
return hidden_states
|
||||
1067
comfy/ldm/ace/lyric_encoder.py
Normal file
1067
comfy/ldm/ace/lyric_encoder.py
Normal file
File diff suppressed because it is too large
Load Diff
385
comfy/ldm/ace/model.py
Normal file
385
comfy/ldm/ace/model.py
Normal file
@@ -0,0 +1,385 @@
|
||||
# Original from: https://github.com/ace-step/ACE-Step/blob/main/models/ace_step_transformer.py
|
||||
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Optional, List, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
import comfy.model_management
|
||||
|
||||
from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
|
||||
from .attention import LinearTransformerBlock, t2i_modulate
|
||||
from .lyric_encoder import ConformerEncoder as LyricEncoder
|
||||
|
||||
|
||||
def cross_norm(hidden_states, controlnet_input):
|
||||
# input N x T x c
|
||||
mean_hidden_states, std_hidden_states = hidden_states.mean(dim=(1,2), keepdim=True), hidden_states.std(dim=(1,2), keepdim=True)
|
||||
mean_controlnet_input, std_controlnet_input = controlnet_input.mean(dim=(1,2), keepdim=True), controlnet_input.std(dim=(1,2), keepdim=True)
|
||||
controlnet_input = (controlnet_input - mean_controlnet_input) * (std_hidden_states / (std_controlnet_input + 1e-12)) + mean_hidden_states
|
||||
return controlnet_input
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
|
||||
class Qwen2RotaryEmbedding(nn.Module):
|
||||
def __init__(self, dim, max_position_embeddings=2048, base=10000, dtype=None, device=None):
|
||||
super().__init__()
|
||||
|
||||
self.dim = dim
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.base = base
|
||||
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=device).float() / self.dim))
|
||||
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||
|
||||
# Build here to make `torch.jit.trace` work.
|
||||
self._set_cos_sin_cache(
|
||||
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.float32
|
||||
)
|
||||
|
||||
def _set_cos_sin_cache(self, seq_len, device, dtype):
|
||||
self.max_seq_len_cached = seq_len
|
||||
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
|
||||
|
||||
freqs = torch.outer(t, self.inv_freq)
|
||||
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
|
||||
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
|
||||
|
||||
def forward(self, x, seq_len=None):
|
||||
# x: [bs, num_attention_heads, seq_len, head_size]
|
||||
if seq_len > self.max_seq_len_cached:
|
||||
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
|
||||
|
||||
return (
|
||||
self.cos_cached[:seq_len].to(dtype=x.dtype),
|
||||
self.sin_cached[:seq_len].to(dtype=x.dtype),
|
||||
)
|
||||
|
||||
|
||||
class T2IFinalLayer(nn.Module):
|
||||
"""
|
||||
The final layer of Sana.
|
||||
"""
|
||||
|
||||
def __init__(self, hidden_size, patch_size=[16, 1], out_channels=256, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm_final = operations.RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
||||
self.linear = operations.Linear(hidden_size, patch_size[0] * patch_size[1] * out_channels, bias=True, dtype=dtype, device=device)
|
||||
self.scale_shift_table = nn.Parameter(torch.empty(2, hidden_size, dtype=dtype, device=device))
|
||||
self.out_channels = out_channels
|
||||
self.patch_size = patch_size
|
||||
|
||||
def unpatchfy(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
width: int,
|
||||
):
|
||||
# 4 unpatchify
|
||||
new_height, new_width = 1, hidden_states.size(1)
|
||||
hidden_states = hidden_states.reshape(
|
||||
shape=(hidden_states.shape[0], new_height, new_width, self.patch_size[0], self.patch_size[1], self.out_channels)
|
||||
).contiguous()
|
||||
hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
|
||||
output = hidden_states.reshape(
|
||||
shape=(hidden_states.shape[0], self.out_channels, new_height * self.patch_size[0], new_width * self.patch_size[1])
|
||||
).contiguous()
|
||||
if width > new_width:
|
||||
output = torch.nn.functional.pad(output, (0, width - new_width, 0, 0), 'constant', 0)
|
||||
elif width < new_width:
|
||||
output = output[:, :, :, :width]
|
||||
return output
|
||||
|
||||
def forward(self, x, t, output_length):
|
||||
shift, scale = (comfy.model_management.cast_to(self.scale_shift_table[None], device=t.device, dtype=t.dtype) + t[:, None]).chunk(2, dim=1)
|
||||
x = t2i_modulate(self.norm_final(x), shift, scale)
|
||||
x = self.linear(x)
|
||||
# unpatchify
|
||||
output = self.unpatchfy(x, output_length)
|
||||
return output
|
||||
|
||||
|
||||
class PatchEmbed(nn.Module):
|
||||
"""2D Image to Patch Embedding"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
height=16,
|
||||
width=4096,
|
||||
patch_size=(16, 1),
|
||||
in_channels=8,
|
||||
embed_dim=1152,
|
||||
bias=True,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
patch_size_h, patch_size_w = patch_size
|
||||
self.early_conv_layers = nn.Sequential(
|
||||
operations.Conv2d(in_channels, in_channels*256, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias, dtype=dtype, device=device),
|
||||
operations.GroupNorm(num_groups=32, num_channels=in_channels*256, eps=1e-6, affine=True, dtype=dtype, device=device),
|
||||
operations.Conv2d(in_channels*256, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias, dtype=dtype, device=device)
|
||||
)
|
||||
self.patch_size = patch_size
|
||||
self.height, self.width = height // patch_size_h, width // patch_size_w
|
||||
self.base_size = self.width
|
||||
|
||||
def forward(self, latent):
|
||||
# early convolutions, N x C x H x W -> N x 256 * sqrt(patch_size) x H/patch_size x W/patch_size
|
||||
latent = self.early_conv_layers(latent)
|
||||
latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC
|
||||
return latent
|
||||
|
||||
|
||||
class ACEStepTransformer2DModel(nn.Module):
|
||||
# _supports_gradient_checkpointing = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: Optional[int] = 8,
|
||||
num_layers: int = 28,
|
||||
inner_dim: int = 1536,
|
||||
attention_head_dim: int = 64,
|
||||
num_attention_heads: int = 24,
|
||||
mlp_ratio: float = 4.0,
|
||||
out_channels: int = 8,
|
||||
max_position: int = 32768,
|
||||
rope_theta: float = 1000000.0,
|
||||
speaker_embedding_dim: int = 512,
|
||||
text_embedding_dim: int = 768,
|
||||
ssl_encoder_depths: List[int] = [9, 9],
|
||||
ssl_names: List[str] = ["mert", "m-hubert"],
|
||||
ssl_latent_dims: List[int] = [1024, 768],
|
||||
lyric_encoder_vocab_size: int = 6681,
|
||||
lyric_hidden_size: int = 1024,
|
||||
patch_size: List[int] = [16, 1],
|
||||
max_height: int = 16,
|
||||
max_width: int = 4096,
|
||||
audio_model=None,
|
||||
dtype=None, device=None, operations=None
|
||||
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.dtype = dtype
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
inner_dim = num_attention_heads * attention_head_dim
|
||||
self.inner_dim = inner_dim
|
||||
self.out_channels = out_channels
|
||||
self.max_position = max_position
|
||||
self.patch_size = patch_size
|
||||
|
||||
self.rope_theta = rope_theta
|
||||
|
||||
self.rotary_emb = Qwen2RotaryEmbedding(
|
||||
dim=self.attention_head_dim,
|
||||
max_position_embeddings=self.max_position,
|
||||
base=self.rope_theta,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
)
|
||||
|
||||
# 2. Define input layers
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.num_layers = num_layers
|
||||
# 3. Define transformers blocks
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
LinearTransformerBlock(
|
||||
dim=self.inner_dim,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
attention_head_dim=attention_head_dim,
|
||||
mlp_ratio=mlp_ratio,
|
||||
add_cross_attention=True,
|
||||
add_cross_attention_dim=self.inner_dim,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
for i in range(self.num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
|
||||
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim, dtype=dtype, device=device, operations=operations)
|
||||
self.t_block = nn.Sequential(nn.SiLU(), operations.Linear(self.inner_dim, 6 * self.inner_dim, bias=True, dtype=dtype, device=device))
|
||||
|
||||
# speaker
|
||||
self.speaker_embedder = operations.Linear(speaker_embedding_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
|
||||
# genre
|
||||
self.genre_embedder = operations.Linear(text_embedding_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
|
||||
# lyric
|
||||
self.lyric_embs = operations.Embedding(lyric_encoder_vocab_size, lyric_hidden_size, dtype=dtype, device=device)
|
||||
self.lyric_encoder = LyricEncoder(input_size=lyric_hidden_size, static_chunk_size=0, dtype=dtype, device=device, operations=operations)
|
||||
self.lyric_proj = operations.Linear(lyric_hidden_size, self.inner_dim, dtype=dtype, device=device)
|
||||
|
||||
projector_dim = 2 * self.inner_dim
|
||||
|
||||
self.projectors = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
operations.Linear(self.inner_dim, projector_dim, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(projector_dim, projector_dim, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(projector_dim, ssl_dim, dtype=dtype, device=device),
|
||||
) for ssl_dim in ssl_latent_dims
|
||||
])
|
||||
|
||||
self.proj_in = PatchEmbed(
|
||||
height=max_height,
|
||||
width=max_width,
|
||||
patch_size=patch_size,
|
||||
embed_dim=self.inner_dim,
|
||||
bias=True,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
operations=operations,
|
||||
)
|
||||
|
||||
self.final_layer = T2IFinalLayer(self.inner_dim, patch_size=patch_size, out_channels=out_channels, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward_lyric_encoder(
|
||||
self,
|
||||
lyric_token_idx: Optional[torch.LongTensor] = None,
|
||||
lyric_mask: Optional[torch.LongTensor] = None,
|
||||
out_dtype=None,
|
||||
):
|
||||
# N x T x D
|
||||
lyric_embs = self.lyric_embs(lyric_token_idx, out_dtype=out_dtype)
|
||||
prompt_prenet_out, _mask = self.lyric_encoder(lyric_embs, lyric_mask, decoding_chunk_size=1, num_decoding_left_chunks=-1)
|
||||
prompt_prenet_out = self.lyric_proj(prompt_prenet_out)
|
||||
return prompt_prenet_out
|
||||
|
||||
def encode(
|
||||
self,
|
||||
encoder_text_hidden_states: Optional[torch.Tensor] = None,
|
||||
text_attention_mask: Optional[torch.LongTensor] = None,
|
||||
speaker_embeds: Optional[torch.FloatTensor] = None,
|
||||
lyric_token_idx: Optional[torch.LongTensor] = None,
|
||||
lyric_mask: Optional[torch.LongTensor] = None,
|
||||
lyrics_strength=1.0,
|
||||
):
|
||||
|
||||
bs = encoder_text_hidden_states.shape[0]
|
||||
device = encoder_text_hidden_states.device
|
||||
|
||||
# speaker embedding
|
||||
encoder_spk_hidden_states = self.speaker_embedder(speaker_embeds).unsqueeze(1)
|
||||
|
||||
# genre embedding
|
||||
encoder_text_hidden_states = self.genre_embedder(encoder_text_hidden_states)
|
||||
|
||||
# lyric
|
||||
encoder_lyric_hidden_states = self.forward_lyric_encoder(
|
||||
lyric_token_idx=lyric_token_idx,
|
||||
lyric_mask=lyric_mask,
|
||||
out_dtype=encoder_text_hidden_states.dtype,
|
||||
)
|
||||
|
||||
encoder_lyric_hidden_states *= lyrics_strength
|
||||
|
||||
encoder_hidden_states = torch.cat([encoder_spk_hidden_states, encoder_text_hidden_states, encoder_lyric_hidden_states], dim=1)
|
||||
|
||||
encoder_hidden_mask = None
|
||||
if text_attention_mask is not None:
|
||||
speaker_mask = torch.ones(bs, 1, device=device)
|
||||
encoder_hidden_mask = torch.cat([speaker_mask, text_attention_mask, lyric_mask], dim=1)
|
||||
|
||||
return encoder_hidden_states, encoder_hidden_mask
|
||||
|
||||
def decode(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
encoder_hidden_mask: torch.Tensor,
|
||||
timestep: Optional[torch.Tensor],
|
||||
output_length: int = 0,
|
||||
block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
|
||||
controlnet_scale: Union[float, torch.Tensor] = 1.0,
|
||||
):
|
||||
embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
|
||||
temb = self.t_block(embedded_timestep)
|
||||
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
|
||||
# controlnet logic
|
||||
if block_controlnet_hidden_states is not None:
|
||||
control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
|
||||
hidden_states = hidden_states + control_condi * controlnet_scale
|
||||
|
||||
# inner_hidden_states = []
|
||||
|
||||
rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
|
||||
encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])
|
||||
|
||||
for index_block, block in enumerate(self.transformer_blocks):
|
||||
hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_hidden_mask,
|
||||
rotary_freqs_cis=rotary_freqs_cis,
|
||||
rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
|
||||
temb=temb,
|
||||
)
|
||||
|
||||
output = self.final_layer(hidden_states, embedded_timestep, output_length)
|
||||
return output
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
timestep,
|
||||
attention_mask=None,
|
||||
context: Optional[torch.Tensor] = None,
|
||||
text_attention_mask: Optional[torch.LongTensor] = None,
|
||||
speaker_embeds: Optional[torch.FloatTensor] = None,
|
||||
lyric_token_idx: Optional[torch.LongTensor] = None,
|
||||
lyric_mask: Optional[torch.LongTensor] = None,
|
||||
block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
|
||||
controlnet_scale: Union[float, torch.Tensor] = 1.0,
|
||||
lyrics_strength=1.0,
|
||||
**kwargs
|
||||
):
|
||||
hidden_states = x
|
||||
encoder_text_hidden_states = context
|
||||
encoder_hidden_states, encoder_hidden_mask = self.encode(
|
||||
encoder_text_hidden_states=encoder_text_hidden_states,
|
||||
text_attention_mask=text_attention_mask,
|
||||
speaker_embeds=speaker_embeds,
|
||||
lyric_token_idx=lyric_token_idx,
|
||||
lyric_mask=lyric_mask,
|
||||
lyrics_strength=lyrics_strength,
|
||||
)
|
||||
|
||||
output_length = hidden_states.shape[-1]
|
||||
|
||||
output = self.decode(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_hidden_mask=encoder_hidden_mask,
|
||||
timestep=timestep,
|
||||
output_length=output_length,
|
||||
block_controlnet_hidden_states=block_controlnet_hidden_states,
|
||||
controlnet_scale=controlnet_scale,
|
||||
)
|
||||
|
||||
return output
|
||||
644
comfy/ldm/ace/vae/autoencoder_dc.py
Normal file
644
comfy/ldm/ace/vae/autoencoder_dc.py
Normal file
@@ -0,0 +1,644 @@
|
||||
# Rewritten from diffusers
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from typing import Tuple, Union
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.ops
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
|
||||
class RMSNorm(ops.RMSNorm):
|
||||
def __init__(self, dim, eps=1e-5, elementwise_affine=True, bias=False):
|
||||
super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
|
||||
if elementwise_affine:
|
||||
self.bias = nn.Parameter(torch.empty(dim)) if bias else None
|
||||
|
||||
def forward(self, x):
|
||||
x = super().forward(x)
|
||||
if self.elementwise_affine:
|
||||
if self.bias is not None:
|
||||
x = x + comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device)
|
||||
return x
|
||||
|
||||
|
||||
def get_normalization(norm_type, num_features, num_groups=32, eps=1e-5):
|
||||
if norm_type == "batch_norm":
|
||||
return nn.BatchNorm2d(num_features)
|
||||
elif norm_type == "group_norm":
|
||||
return ops.GroupNorm(num_groups, num_features)
|
||||
elif norm_type == "layer_norm":
|
||||
return ops.LayerNorm(num_features)
|
||||
elif norm_type == "rms_norm":
|
||||
return RMSNorm(num_features, eps=eps, elementwise_affine=True, bias=True)
|
||||
else:
|
||||
raise ValueError(f"Unknown normalization type: {norm_type}")
|
||||
|
||||
|
||||
def get_activation(activation_type):
|
||||
if activation_type == "relu":
|
||||
return nn.ReLU()
|
||||
elif activation_type == "relu6":
|
||||
return nn.ReLU6()
|
||||
elif activation_type == "silu":
|
||||
return nn.SiLU()
|
||||
elif activation_type == "leaky_relu":
|
||||
return nn.LeakyReLU(0.2)
|
||||
else:
|
||||
raise ValueError(f"Unknown activation type: {activation_type}")
|
||||
|
||||
|
||||
class ResBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
norm_type: str = "batch_norm",
|
||||
act_fn: str = "relu6",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.norm_type = norm_type
|
||||
self.nonlinearity = get_activation(act_fn) if act_fn is not None else nn.Identity()
|
||||
self.conv1 = ops.Conv2d(in_channels, in_channels, 3, 1, 1)
|
||||
self.conv2 = ops.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False)
|
||||
self.norm = get_normalization(norm_type, out_channels)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
hidden_states = self.conv1(hidden_states)
|
||||
hidden_states = self.nonlinearity(hidden_states)
|
||||
hidden_states = self.conv2(hidden_states)
|
||||
|
||||
if self.norm_type == "rms_norm":
|
||||
# move channel to the last dimension so we apply RMSnorm across channel dimension
|
||||
hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
else:
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
return hidden_states + residual
|
||||
|
||||
class SanaMultiscaleAttentionProjection(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
num_attention_heads: int,
|
||||
kernel_size: int,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
channels = 3 * in_channels
|
||||
self.proj_in = ops.Conv2d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
padding=kernel_size // 2,
|
||||
groups=channels,
|
||||
bias=False,
|
||||
)
|
||||
self.proj_out = ops.Conv2d(channels, channels, 1, 1, 0, groups=3 * num_attention_heads, bias=False)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
hidden_states = self.proj_in(hidden_states)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
class SanaMultiscaleLinearAttention(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
num_attention_heads: int = None,
|
||||
attention_head_dim: int = 8,
|
||||
mult: float = 1.0,
|
||||
norm_type: str = "batch_norm",
|
||||
kernel_sizes: tuple = (5,),
|
||||
eps: float = 1e-15,
|
||||
residual_connection: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.eps = eps
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.norm_type = norm_type
|
||||
self.residual_connection = residual_connection
|
||||
|
||||
num_attention_heads = (
|
||||
int(in_channels // attention_head_dim * mult)
|
||||
if num_attention_heads is None
|
||||
else num_attention_heads
|
||||
)
|
||||
inner_dim = num_attention_heads * attention_head_dim
|
||||
|
||||
self.to_q = ops.Linear(in_channels, inner_dim, bias=False)
|
||||
self.to_k = ops.Linear(in_channels, inner_dim, bias=False)
|
||||
self.to_v = ops.Linear(in_channels, inner_dim, bias=False)
|
||||
|
||||
self.to_qkv_multiscale = nn.ModuleList()
|
||||
for kernel_size in kernel_sizes:
|
||||
self.to_qkv_multiscale.append(
|
||||
SanaMultiscaleAttentionProjection(inner_dim, num_attention_heads, kernel_size)
|
||||
)
|
||||
|
||||
self.nonlinearity = nn.ReLU()
|
||||
self.to_out = ops.Linear(inner_dim * (1 + len(kernel_sizes)), out_channels, bias=False)
|
||||
self.norm_out = get_normalization(norm_type, out_channels)
|
||||
|
||||
def apply_linear_attention(self, query, key, value):
|
||||
value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1)
|
||||
scores = torch.matmul(value, key.transpose(-1, -2))
|
||||
hidden_states = torch.matmul(scores, query)
|
||||
|
||||
hidden_states = hidden_states.to(dtype=torch.float32)
|
||||
hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps)
|
||||
return hidden_states
|
||||
|
||||
def apply_quadratic_attention(self, query, key, value):
|
||||
scores = torch.matmul(key.transpose(-1, -2), query)
|
||||
scores = scores.to(dtype=torch.float32)
|
||||
scores = scores / (torch.sum(scores, dim=2, keepdim=True) + self.eps)
|
||||
hidden_states = torch.matmul(value, scores.to(value.dtype))
|
||||
return hidden_states
|
||||
|
||||
def forward(self, hidden_states):
|
||||
height, width = hidden_states.shape[-2:]
|
||||
if height * width > self.attention_head_dim:
|
||||
use_linear_attention = True
|
||||
else:
|
||||
use_linear_attention = False
|
||||
|
||||
residual = hidden_states
|
||||
|
||||
batch_size, _, height, width = list(hidden_states.size())
|
||||
original_dtype = hidden_states.dtype
|
||||
|
||||
hidden_states = hidden_states.movedim(1, -1)
|
||||
query = self.to_q(hidden_states)
|
||||
key = self.to_k(hidden_states)
|
||||
value = self.to_v(hidden_states)
|
||||
hidden_states = torch.cat([query, key, value], dim=3)
|
||||
hidden_states = hidden_states.movedim(-1, 1)
|
||||
|
||||
multi_scale_qkv = [hidden_states]
|
||||
for block in self.to_qkv_multiscale:
|
||||
multi_scale_qkv.append(block(hidden_states))
|
||||
|
||||
hidden_states = torch.cat(multi_scale_qkv, dim=1)
|
||||
|
||||
if use_linear_attention:
|
||||
# for linear attention upcast hidden_states to float32
|
||||
hidden_states = hidden_states.to(dtype=torch.float32)
|
||||
|
||||
hidden_states = hidden_states.reshape(batch_size, -1, 3 * self.attention_head_dim, height * width)
|
||||
|
||||
query, key, value = hidden_states.chunk(3, dim=2)
|
||||
query = self.nonlinearity(query)
|
||||
key = self.nonlinearity(key)
|
||||
|
||||
if use_linear_attention:
|
||||
hidden_states = self.apply_linear_attention(query, key, value)
|
||||
hidden_states = hidden_states.to(dtype=original_dtype)
|
||||
else:
|
||||
hidden_states = self.apply_quadratic_attention(query, key, value)
|
||||
|
||||
hidden_states = torch.reshape(hidden_states, (batch_size, -1, height, width))
|
||||
hidden_states = self.to_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
|
||||
if self.norm_type == "rms_norm":
|
||||
hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
else:
|
||||
hidden_states = self.norm_out(hidden_states)
|
||||
|
||||
if self.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
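Not part of the diff: a minimal sketch of the softmax-free attention used in apply_linear_attention above, restated on plain tensors so the ones-row normalisation trick is visible (shapes are illustrative).

# Illustrative sketch, not part of the diff: ReLU linear attention with the ones-row
# normalisation used by SanaMultiscaleLinearAttention.apply_linear_attention.
import torch
import torch.nn.functional as F

def relu_linear_attention(query, key, value, eps=1e-15):
    # query/key/value: (batch, heads, head_dim, tokens); query and key already ReLU'd
    value = F.pad(value, (0, 0, 0, 1), mode="constant", value=1)  # append a row of ones
    scores = torch.matmul(value, key.transpose(-1, -2))           # (b, h, d+1, d)
    out = torch.matmul(scores, query).to(torch.float32)           # (b, h, d+1, tokens)
    return out[:, :, :-1] / (out[:, :, -1:] + eps)                # ones row acts as the normaliser

q = torch.relu(torch.randn(1, 2, 8, 64))
k = torch.relu(torch.randn(1, 2, 8, 64))
v = torch.randn(1, 2, 8, 64)
print(relu_linear_attention(q, k, v).shape)  # torch.Size([1, 2, 8, 64])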
class EfficientViTBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
mult: float = 1.0,
|
||||
attention_head_dim: int = 32,
|
||||
qkv_multiscales: tuple = (5,),
|
||||
norm_type: str = "batch_norm",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.attn = SanaMultiscaleLinearAttention(
|
||||
in_channels=in_channels,
|
||||
out_channels=in_channels,
|
||||
mult=mult,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type=norm_type,
|
||||
kernel_sizes=qkv_multiscales,
|
||||
residual_connection=True,
|
||||
)
|
||||
|
||||
self.conv_out = GLUMBConv(
|
||||
in_channels=in_channels,
|
||||
out_channels=in_channels,
|
||||
norm_type="rms_norm",
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = self.attn(x)
|
||||
x = self.conv_out(x)
|
||||
return x
|
||||
|
||||
|
||||
class GLUMBConv(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
expand_ratio: float = 4,
|
||||
norm_type: str = None,
|
||||
residual_connection: bool = True,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
hidden_channels = int(expand_ratio * in_channels)
|
||||
self.norm_type = norm_type
|
||||
self.residual_connection = residual_connection
|
||||
|
||||
self.nonlinearity = nn.SiLU()
|
||||
self.conv_inverted = ops.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
|
||||
self.conv_depth = ops.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
|
||||
self.conv_point = ops.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
|
||||
|
||||
self.norm = None
|
||||
if norm_type == "rms_norm":
|
||||
self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
if self.residual_connection:
|
||||
residual = hidden_states
|
||||
|
||||
hidden_states = self.conv_inverted(hidden_states)
|
||||
hidden_states = self.nonlinearity(hidden_states)
|
||||
|
||||
hidden_states = self.conv_depth(hidden_states)
|
||||
hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
|
||||
hidden_states = hidden_states * self.nonlinearity(gate)
|
||||
|
||||
hidden_states = self.conv_point(hidden_states)
|
||||
|
||||
if self.norm_type == "rms_norm":
|
||||
# move channel to the last dimension so we apply RMSnorm across channel dimension
|
||||
hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
|
||||
if self.residual_connection:
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
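Not part of the diff: the GLU-style gating at the heart of GLUMBConv.forward, restated standalone (channel count and spatial size are illustrative).

# Illustrative sketch, not part of the diff: the SiLU-gated split used in GLUMBConv.forward.
import torch
import torch.nn as nn

hidden_channels = 16
conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
nonlinearity = nn.SiLU()

x = torch.randn(1, hidden_channels * 2, 32, 32)   # output of the 1x1 "inverted" conv
x = conv_depth(x)                                 # depthwise 3x3
value, gate = torch.chunk(x, 2, dim=1)            # split channels into value / gate halves
out = value * nonlinearity(gate)                  # gated product, as in the module
print(out.shape)                                  # torch.Size([1, 16, 32, 32])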
def get_block(
|
||||
block_type: str,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
attention_head_dim: int,
|
||||
norm_type: str,
|
||||
act_fn: str,
|
||||
qkv_mutliscales: tuple = (),
|
||||
):
|
||||
if block_type == "ResBlock":
|
||||
block = ResBlock(in_channels, out_channels, norm_type, act_fn)
|
||||
elif block_type == "EfficientViTBlock":
|
||||
block = EfficientViTBlock(
|
||||
in_channels,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type=norm_type,
|
||||
qkv_multiscales=qkv_mutliscales
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Block with {block_type=} is not supported.")
|
||||
|
||||
return block
|
||||
|
||||
|
||||
class DCDownBlock2d(nn.Module):
|
||||
def __init__(self, in_channels: int, out_channels: int, downsample: bool = False, shortcut: bool = True) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.downsample = downsample
|
||||
self.factor = 2
|
||||
self.stride = 1 if downsample else 2
|
||||
self.group_size = in_channels * self.factor**2 // out_channels
|
||||
self.shortcut = shortcut
|
||||
|
||||
out_ratio = self.factor**2
|
||||
if downsample:
|
||||
assert out_channels % out_ratio == 0
|
||||
out_channels = out_channels // out_ratio
|
||||
|
||||
self.conv = ops.Conv2d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=3,
|
||||
stride=self.stride,
|
||||
padding=1,
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
x = self.conv(hidden_states)
|
||||
if self.downsample:
|
||||
x = F.pixel_unshuffle(x, self.factor)
|
||||
|
||||
if self.shortcut:
|
||||
y = F.pixel_unshuffle(hidden_states, self.factor)
|
||||
y = y.unflatten(1, (-1, self.group_size))
|
||||
y = y.mean(dim=2)
|
||||
hidden_states = x + y
|
||||
else:
|
||||
hidden_states = x
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class DCUpBlock2d(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
interpolate: bool = False,
|
||||
shortcut: bool = True,
|
||||
interpolation_mode: str = "nearest",
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.interpolate = interpolate
|
||||
self.interpolation_mode = interpolation_mode
|
||||
self.shortcut = shortcut
|
||||
self.factor = 2
|
||||
self.repeats = out_channels * self.factor**2 // in_channels
|
||||
|
||||
out_ratio = self.factor**2
|
||||
if not interpolate:
|
||||
out_channels = out_channels * out_ratio
|
||||
|
||||
self.conv = ops.Conv2d(in_channels, out_channels, 3, 1, 1)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
if self.interpolate:
|
||||
x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
|
||||
x = self.conv(x)
|
||||
else:
|
||||
x = self.conv(hidden_states)
|
||||
x = F.pixel_shuffle(x, self.factor)
|
||||
|
||||
if self.shortcut:
|
||||
y = hidden_states.repeat_interleave(self.repeats, dim=1, output_size=hidden_states.shape[1] * self.repeats)
|
||||
y = F.pixel_shuffle(y, self.factor)
|
||||
hidden_states = x + y
|
||||
else:
|
||||
hidden_states = x
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
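Not part of the diff: a minimal sketch of the parameter-free shortcut paths the two blocks above use, assuming an 8-channel input, a 16-channel latent and factor 2.

# Illustrative sketch, not part of the diff: the shortcut paths of DCDownBlock2d and
# DCUpBlock2d, with in/out channel counts chosen only for the example.
import torch
import torch.nn.functional as F

factor = 2
x = torch.randn(1, 8, 32, 32)                       # 8 channels in, 16 channels out (down)

# DCDownBlock2d shortcut: pixel_unshuffle, then average channel groups down to out_channels.
group_size = 8 * factor**2 // 16                    # = 2
y = F.pixel_unshuffle(x, factor)                    # (1, 32, 16, 16)
y = y.unflatten(1, (-1, group_size)).mean(dim=2)    # (1, 16, 16, 16)

# DCUpBlock2d shortcut: repeat channels up to out_channels * factor**2, then pixel_shuffle.
repeats = 8 * factor**2 // 16                       # = 2 (16 channels in, 8 out)
z = F.pixel_shuffle(y.repeat_interleave(repeats, dim=1), factor)
print(y.shape, z.shape)                             # (1, 16, 16, 16) (1, 8, 32, 32)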
class Encoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
latent_channels: int,
|
||||
attention_head_dim: int = 32,
|
||||
block_type: str or tuple = "ResBlock",
|
||||
block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
|
||||
layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
|
||||
qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
|
||||
downsample_block_type: str = "pixel_unshuffle",
|
||||
out_shortcut: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
num_blocks = len(block_out_channels)
|
||||
|
||||
if isinstance(block_type, str):
|
||||
block_type = (block_type,) * num_blocks
|
||||
|
||||
if layers_per_block[0] > 0:
|
||||
self.conv_in = ops.Conv2d(
|
||||
in_channels,
|
||||
block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
)
|
||||
else:
|
||||
self.conv_in = DCDownBlock2d(
|
||||
in_channels=in_channels,
|
||||
out_channels=block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1],
|
||||
downsample=downsample_block_type == "pixel_unshuffle",
|
||||
shortcut=False,
|
||||
)
|
||||
|
||||
down_blocks = []
|
||||
for i, (out_channel, num_layers) in enumerate(zip(block_out_channels, layers_per_block)):
|
||||
down_block_list = []
|
||||
|
||||
for _ in range(num_layers):
|
||||
block = get_block(
|
||||
block_type[i],
|
||||
out_channel,
|
||||
out_channel,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type="rms_norm",
|
||||
act_fn="silu",
|
||||
qkv_mutliscales=qkv_multiscales[i],
|
||||
)
|
||||
down_block_list.append(block)
|
||||
|
||||
if i < num_blocks - 1 and num_layers > 0:
|
||||
downsample_block = DCDownBlock2d(
|
||||
in_channels=out_channel,
|
||||
out_channels=block_out_channels[i + 1],
|
||||
downsample=downsample_block_type == "pixel_unshuffle",
|
||||
shortcut=True,
|
||||
)
|
||||
down_block_list.append(downsample_block)
|
||||
|
||||
down_blocks.append(nn.Sequential(*down_block_list))
|
||||
|
||||
self.down_blocks = nn.ModuleList(down_blocks)
|
||||
|
||||
self.conv_out = ops.Conv2d(block_out_channels[-1], latent_channels, 3, 1, 1)
|
||||
|
||||
self.out_shortcut = out_shortcut
|
||||
if out_shortcut:
|
||||
self.out_shortcut_average_group_size = block_out_channels[-1] // latent_channels
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
hidden_states = self.conv_in(hidden_states)
|
||||
for down_block in self.down_blocks:
|
||||
hidden_states = down_block(hidden_states)
|
||||
|
||||
if self.out_shortcut:
|
||||
x = hidden_states.unflatten(1, (-1, self.out_shortcut_average_group_size))
|
||||
x = x.mean(dim=2)
|
||||
hidden_states = self.conv_out(hidden_states) + x
|
||||
else:
|
||||
hidden_states = self.conv_out(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int,
|
||||
latent_channels: int,
|
||||
attention_head_dim: int = 32,
|
||||
block_type: str or tuple = "ResBlock",
|
||||
block_out_channels: tuple = (128, 256, 512, 512, 1024, 1024),
|
||||
layers_per_block: tuple = (2, 2, 2, 2, 2, 2),
|
||||
qkv_multiscales: tuple = ((), (), (), (5,), (5,), (5,)),
|
||||
norm_type: str or tuple = "rms_norm",
|
||||
act_fn: str or tuple = "silu",
|
||||
upsample_block_type: str = "pixel_shuffle",
|
||||
in_shortcut: bool = True,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
num_blocks = len(block_out_channels)
|
||||
|
||||
if isinstance(block_type, str):
|
||||
block_type = (block_type,) * num_blocks
|
||||
if isinstance(norm_type, str):
|
||||
norm_type = (norm_type,) * num_blocks
|
||||
if isinstance(act_fn, str):
|
||||
act_fn = (act_fn,) * num_blocks
|
||||
|
||||
self.conv_in = ops.Conv2d(latent_channels, block_out_channels[-1], 3, 1, 1)
|
||||
|
||||
self.in_shortcut = in_shortcut
|
||||
if in_shortcut:
|
||||
self.in_shortcut_repeats = block_out_channels[-1] // latent_channels
|
||||
|
||||
up_blocks = []
|
||||
for i, (out_channel, num_layers) in reversed(list(enumerate(zip(block_out_channels, layers_per_block)))):
|
||||
up_block_list = []
|
||||
|
||||
if i < num_blocks - 1 and num_layers > 0:
|
||||
upsample_block = DCUpBlock2d(
|
||||
block_out_channels[i + 1],
|
||||
out_channel,
|
||||
interpolate=upsample_block_type == "interpolate",
|
||||
shortcut=True,
|
||||
)
|
||||
up_block_list.append(upsample_block)
|
||||
|
||||
for _ in range(num_layers):
|
||||
block = get_block(
|
||||
block_type[i],
|
||||
out_channel,
|
||||
out_channel,
|
||||
attention_head_dim=attention_head_dim,
|
||||
norm_type=norm_type[i],
|
||||
act_fn=act_fn[i],
|
||||
qkv_mutliscales=qkv_multiscales[i],
|
||||
)
|
||||
up_block_list.append(block)
|
||||
|
||||
up_blocks.insert(0, nn.Sequential(*up_block_list))
|
||||
|
||||
self.up_blocks = nn.ModuleList(up_blocks)
|
||||
|
||||
channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
|
||||
|
||||
self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
|
||||
self.conv_act = nn.ReLU()
|
||||
self.conv_out = None
|
||||
|
||||
if layers_per_block[0] > 0:
|
||||
self.conv_out = ops.Conv2d(channels, in_channels, 3, 1, 1)
|
||||
else:
|
||||
self.conv_out = DCUpBlock2d(
|
||||
channels, in_channels, interpolate=upsample_block_type == "interpolate", shortcut=False
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
if self.in_shortcut:
|
||||
x = hidden_states.repeat_interleave(
|
||||
self.in_shortcut_repeats, dim=1, output_size=hidden_states.shape[1] * self.in_shortcut_repeats
|
||||
)
|
||||
hidden_states = self.conv_in(hidden_states) + x
|
||||
else:
|
||||
hidden_states = self.conv_in(hidden_states)
|
||||
|
||||
for up_block in reversed(self.up_blocks):
|
||||
hidden_states = up_block(hidden_states)
|
||||
|
||||
hidden_states = self.norm_out(hidden_states.movedim(1, -1)).movedim(-1, 1)
|
||||
hidden_states = self.conv_act(hidden_states)
|
||||
hidden_states = self.conv_out(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class AutoencoderDC(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels: int = 2,
|
||||
latent_channels: int = 8,
|
||||
attention_head_dim: int = 32,
|
||||
encoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
|
||||
decoder_block_types: Union[str, Tuple[str]] = ["ResBlock", "ResBlock", "ResBlock", "EfficientViTBlock"],
|
||||
encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
|
||||
decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024),
|
||||
encoder_layers_per_block: Tuple[int] = (2, 2, 3, 3),
|
||||
decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3),
|
||||
encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
|
||||
decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (5,), (5,)),
|
||||
upsample_block_type: str = "interpolate",
|
||||
downsample_block_type: str = "Conv",
|
||||
decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
|
||||
decoder_act_fns: Union[str, Tuple[str]] = "silu",
|
||||
scaling_factor: float = 0.41407,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.encoder = Encoder(
|
||||
in_channels=in_channels,
|
||||
latent_channels=latent_channels,
|
||||
attention_head_dim=attention_head_dim,
|
||||
block_type=encoder_block_types,
|
||||
block_out_channels=encoder_block_out_channels,
|
||||
layers_per_block=encoder_layers_per_block,
|
||||
qkv_multiscales=encoder_qkv_multiscales,
|
||||
downsample_block_type=downsample_block_type,
|
||||
)
|
||||
|
||||
self.decoder = Decoder(
|
||||
in_channels=in_channels,
|
||||
latent_channels=latent_channels,
|
||||
attention_head_dim=attention_head_dim,
|
||||
block_type=decoder_block_types,
|
||||
block_out_channels=decoder_block_out_channels,
|
||||
layers_per_block=decoder_layers_per_block,
|
||||
qkv_multiscales=decoder_qkv_multiscales,
|
||||
norm_type=decoder_norm_types,
|
||||
act_fn=decoder_act_fns,
|
||||
upsample_block_type=upsample_block_type,
|
||||
)
|
||||
|
||||
self.scaling_factor = scaling_factor
|
||||
self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
|
||||
|
||||
def encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
"""Internal encoding function."""
|
||||
encoded = self.encoder(x)
|
||||
return encoded * self.scaling_factor
|
||||
|
||||
def decode(self, z: torch.Tensor) -> torch.Tensor:
|
||||
# Scale the latents back
|
||||
z = z / self.scaling_factor
|
||||
decoded = self.decoder(z)
|
||||
return decoded
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
z = self.encode(x)
|
||||
return self.decode(z)
|
||||
|
||||
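Not part of the diff: a shape round trip through AutoencoderDC with its default arguments and random weights. It assumes this file is importable as comfy.ldm.ace.vae.autoencoder_dc (the pipeline import in the next file suggests it is) and that the helpers defined earlier in the file (ResBlock, RMSNorm, ops) are available.

# Illustrative sketch, not part of the diff: AutoencoderDC shape check with random weights.
# Default config: 4 encoder stages -> spatial compression 2**(4-1) = 8.
import torch
from comfy.ldm.ace.vae.autoencoder_dc import AutoencoderDC  # assumed module path

model = AutoencoderDC().eval()
x = torch.randn(1, 2, 64, 64)            # 2-channel "mel image", sizes divisible by 8
with torch.no_grad():
    z = model.encode(x)                  # latents scaled by scaling_factor
    y = model.decode(z)
print(z.shape, y.shape)                  # torch.Size([1, 8, 8, 8]) torch.Size([1, 2, 64, 64])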
comfy/ldm/ace/vae/music_dcae_pipeline.py (new file, 109 lines)
@@ -0,0 +1,109 @@
|
||||
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_dcae_pipeline.py
|
||||
import torch
|
||||
from .autoencoder_dc import AutoencoderDC
|
||||
import logging
|
||||
try:
|
||||
import torchaudio
|
||||
except:
|
||||
logging.warning("torchaudio missing, ACE model will be broken")
|
||||
|
||||
import torchvision.transforms as transforms
|
||||
from .music_vocoder import ADaMoSHiFiGANV1
|
||||
|
||||
|
||||
class MusicDCAE(torch.nn.Module):
|
||||
def __init__(self, source_sample_rate=None, dcae_config={}, vocoder_config={}):
|
||||
super(MusicDCAE, self).__init__()
|
||||
|
||||
self.dcae = AutoencoderDC(**dcae_config)
|
||||
self.vocoder = ADaMoSHiFiGANV1(**vocoder_config)
|
||||
|
||||
if source_sample_rate is None:
|
||||
self.source_sample_rate = 48000
|
||||
else:
|
||||
self.source_sample_rate = source_sample_rate
|
||||
|
||||
# self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)
|
||||
|
||||
self.transform = transforms.Compose([
|
||||
transforms.Normalize(0.5, 0.5),
|
||||
])
|
||||
self.min_mel_value = -11.0
|
||||
self.max_mel_value = 3.0
|
||||
self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000)))
|
||||
self.mel_chunk_size = 1024
|
||||
self.time_dimention_multiple = 8
|
||||
self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple
|
||||
self.scale_factor = 0.1786
|
||||
self.shift_factor = -1.9091
|
||||
|
||||
def load_audio(self, audio_path):
|
||||
audio, sr = torchaudio.load(audio_path)
|
||||
return audio, sr
|
||||
|
||||
def forward_mel(self, audios):
|
||||
mels = []
|
||||
for i in range(len(audios)):
|
||||
image = self.vocoder.mel_transform(audios[i])
|
||||
mels.append(image)
|
||||
mels = torch.stack(mels)
|
||||
return mels
|
||||
|
||||
@torch.no_grad()
|
||||
def encode(self, audios, audio_lengths=None, sr=None):
|
||||
if audio_lengths is None:
|
||||
audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0])
|
||||
audio_lengths = audio_lengths.to(audios.device)
|
||||
|
||||
if sr is None:
|
||||
sr = self.source_sample_rate
|
||||
|
||||
if sr != 44100:
|
||||
audios = torchaudio.functional.resample(audios, sr, 44100)
|
||||
|
||||
max_audio_len = audios.shape[-1]
|
||||
if max_audio_len % (8 * 512) != 0:
|
||||
audios = torch.nn.functional.pad(audios, (0, 8 * 512 - max_audio_len % (8 * 512)))
|
||||
|
||||
mels = self.forward_mel(audios)
|
||||
mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value)
|
||||
mels = self.transform(mels)
|
||||
latents = []
|
||||
for mel in mels:
|
||||
latent = self.dcae.encoder(mel.unsqueeze(0))
|
||||
latents.append(latent)
|
||||
latents = torch.cat(latents, dim=0)
|
||||
# latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
|
||||
latents = (latents - self.shift_factor) * self.scale_factor
|
||||
return latents
|
||||
# return latents, latent_lengths
|
||||
|
||||
@torch.no_grad()
|
||||
def decode(self, latents, audio_lengths=None, sr=None):
|
||||
latents = latents / self.scale_factor + self.shift_factor
|
||||
|
||||
pred_wavs = []
|
||||
|
||||
for latent in latents:
|
||||
mels = self.dcae.decoder(latent.unsqueeze(0))
|
||||
mels = mels * 0.5 + 0.5
|
||||
mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value
|
||||
wav = self.vocoder.decode(mels[0]).squeeze(1)
|
||||
|
||||
if sr is not None:
|
||||
# resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
|
||||
wav = torchaudio.functional.resample(wav, 44100, sr)
|
||||
# wav = resampler(wav)
|
||||
else:
|
||||
sr = 44100
|
||||
pred_wavs.append(wav)
|
||||
|
||||
if audio_lengths is not None:
|
||||
pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
|
||||
return torch.stack(pred_wavs)
|
||||
# return sr, pred_wavs
|
||||
|
||||
def forward(self, audios, audio_lengths=None, sr=None):
|
||||
latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)
|
||||
sr, pred_wavs = self.decode(latents=latents, audio_lengths=audio_lengths, sr=sr)
|
||||
return sr, pred_wavs, latents, latent_lengths
|
||||
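Not part of the diff: a shape-level sketch of MusicDCAE.encode with the default (random-weight) sub-model configs; it assumes torchaudio and torchvision are installed, since the module imports both.

# Illustrative sketch, not part of the diff: MusicDCAE.encode shape check with default
# configs and random weights (values are meaningless, only the shapes matter here).
import torch
from comfy.ldm.ace.vae.music_dcae_pipeline import MusicDCAE

pipe = MusicDCAE(source_sample_rate=44100).eval()
audio = torch.randn(1, 2, 2 * 44100)          # (batch, stereo channels, samples)
latents = pipe.encode(audio, sr=44100)        # pads to a multiple of 8 * 512 samples
print(latents.shape)                          # e.g. torch.Size([1, 8, 16, 22])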
comfy/ldm/ace/vae/music_log_mel.py (new executable file, 113 lines)
@@ -0,0 +1,113 @@
|
||||
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_log_mel.py
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch import Tensor
|
||||
import logging
|
||||
try:
|
||||
from torchaudio.transforms import MelScale
|
||||
except:
|
||||
logging.warning("torchaudio missing, ACE model will be broken")
|
||||
|
||||
import comfy.model_management
|
||||
|
||||
class LinearSpectrogram(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
n_fft=2048,
|
||||
win_length=2048,
|
||||
hop_length=512,
|
||||
center=False,
|
||||
mode="pow2_sqrt",
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.n_fft = n_fft
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.center = center
|
||||
self.mode = mode
|
||||
|
||||
self.register_buffer("window", torch.hann_window(win_length))
|
||||
|
||||
def forward(self, y: Tensor) -> Tensor:
|
||||
if y.ndim == 3:
|
||||
y = y.squeeze(1)
|
||||
|
||||
y = torch.nn.functional.pad(
|
||||
y.unsqueeze(1),
|
||||
(
|
||||
(self.win_length - self.hop_length) // 2,
|
||||
(self.win_length - self.hop_length + 1) // 2,
|
||||
),
|
||||
mode="reflect",
|
||||
).squeeze(1)
|
||||
dtype = y.dtype
|
||||
spec = torch.stft(
|
||||
y.float(),
|
||||
self.n_fft,
|
||||
hop_length=self.hop_length,
|
||||
win_length=self.win_length,
|
||||
window=comfy.model_management.cast_to(self.window, dtype=torch.float32, device=y.device),
|
||||
center=self.center,
|
||||
pad_mode="reflect",
|
||||
normalized=False,
|
||||
onesided=True,
|
||||
return_complex=True,
|
||||
)
|
||||
spec = torch.view_as_real(spec)
|
||||
|
||||
if self.mode == "pow2_sqrt":
|
||||
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
|
||||
spec = spec.to(dtype)
|
||||
return spec
|
||||
|
||||
|
||||
class LogMelSpectrogram(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
sample_rate=44100,
|
||||
n_fft=2048,
|
||||
win_length=2048,
|
||||
hop_length=512,
|
||||
n_mels=128,
|
||||
center=False,
|
||||
f_min=0.0,
|
||||
f_max=None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.sample_rate = sample_rate
|
||||
self.n_fft = n_fft
|
||||
self.win_length = win_length
|
||||
self.hop_length = hop_length
|
||||
self.center = center
|
||||
self.n_mels = n_mels
|
||||
self.f_min = f_min
|
||||
self.f_max = f_max or sample_rate // 2
|
||||
|
||||
self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)
|
||||
self.mel_scale = MelScale(
|
||||
self.n_mels,
|
||||
self.sample_rate,
|
||||
self.f_min,
|
||||
self.f_max,
|
||||
self.n_fft // 2 + 1,
|
||||
"slaney",
|
||||
"slaney",
|
||||
)
|
||||
|
||||
def compress(self, x: Tensor) -> Tensor:
|
||||
return torch.log(torch.clamp(x, min=1e-5))
|
||||
|
||||
def decompress(self, x: Tensor) -> Tensor:
|
||||
return torch.exp(x)
|
||||
|
||||
def forward(self, x: Tensor, return_linear: bool = False) -> Tensor:
|
||||
linear = self.spectrogram(x)
|
||||
x = self.mel_scale(linear)
|
||||
x = self.compress(x)
|
||||
# print(x.shape)
|
||||
if return_linear:
|
||||
return x, self.compress(linear)
|
||||
|
||||
return x
|
||||
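Not part of the diff: a quick check of the framing, assuming torchaudio is installed for MelScale; with hop 512 and the reflect padding above, one second at 44.1 kHz yields 86 frames.

# Illustrative sketch, not part of the diff: log-mel frames for one second of audio.
import torch
from comfy.ldm.ace.vae.music_log_mel import LogMelSpectrogram

mel = LogMelSpectrogram()                 # 44.1 kHz, n_fft 2048, hop 512, 128 mel bins
x = torch.randn(1, 44100)
print(mel(x).shape)                       # torch.Size([1, 128, 86]) ~= 44100 / 512 frames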
comfy/ldm/ace/vae/music_vocoder.py (new executable file, 538 lines)
@@ -0,0 +1,538 @@
|
||||
# Original from: https://github.com/ace-step/ACE-Step/blob/main/music_dcae/music_vocoder.py
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from functools import partial
|
||||
from math import prod
|
||||
from typing import Callable, Tuple, List
|
||||
|
||||
import numpy as np
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm
|
||||
|
||||
from .music_log_mel import LogMelSpectrogram
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.ops
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
|
||||
def drop_path(
|
||||
x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
|
||||
):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
|
||||
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
||||
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
||||
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
||||
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
||||
'survival rate' as the argument.
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
if drop_prob == 0.0 or not training:
|
||||
return x
|
||||
keep_prob = 1 - drop_prob
|
||||
shape = (x.shape[0],) + (1,) * (
|
||||
x.ndim - 1
|
||||
) # work with diff dim tensors, not just 2D ConvNets
|
||||
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
||||
if keep_prob > 0.0 and scale_by_keep:
|
||||
random_tensor.div_(keep_prob)
|
||||
return x * random_tensor
|
||||
|
||||
|
||||
class DropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501
|
||||
|
||||
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
|
||||
super(DropPath, self).__init__()
|
||||
self.drop_prob = drop_prob
|
||||
self.scale_by_keep = scale_by_keep
|
||||
|
||||
def forward(self, x):
|
||||
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
|
||||
|
||||
def extra_repr(self):
|
||||
return f"drop_prob={round(self.drop_prob,3):0.3f}"
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
|
||||
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
|
||||
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
|
||||
with shape (batch_size, channels, height, width).
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.ones(normalized_shape))
|
||||
self.bias = nn.Parameter(torch.zeros(normalized_shape))
|
||||
self.eps = eps
|
||||
self.data_format = data_format
|
||||
if self.data_format not in ["channels_last", "channels_first"]:
|
||||
raise NotImplementedError
|
||||
self.normalized_shape = (normalized_shape,)
|
||||
|
||||
def forward(self, x):
|
||||
if self.data_format == "channels_last":
|
||||
return F.layer_norm(
|
||||
x, self.normalized_shape, comfy.model_management.cast_to(self.weight, dtype=x.dtype, device=x.device), comfy.model_management.cast_to(self.bias, dtype=x.dtype, device=x.device), self.eps
|
||||
)
|
||||
elif self.data_format == "channels_first":
|
||||
u = x.mean(1, keepdim=True)
|
||||
s = (x - u).pow(2).mean(1, keepdim=True)
|
||||
x = (x - u) / torch.sqrt(s + self.eps)
|
||||
x = comfy.model_management.cast_to(self.weight[:, None], dtype=x.dtype, device=x.device) * x + comfy.model_management.cast_to(self.bias[:, None], dtype=x.dtype, device=x.device)
|
||||
return x
|
||||
|
||||
|
||||
class ConvNeXtBlock(nn.Module):
|
||||
r"""ConvNeXt Block. There are two equivalent implementations:
|
||||
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
|
||||
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
|
||||
We use (2) as we find it slightly faster in PyTorch
|
||||
|
||||
Args:
|
||||
dim (int): Number of input channels.
|
||||
drop_path (float): Stochastic depth rate. Default: 0.0
|
||||
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
|
||||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
|
||||
kernel_size (int): Kernel size for depthwise conv. Default: 7.
|
||||
dilation (int): Dilation for depthwise conv. Default: 1.
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
drop_path: float = 0.0,
|
||||
layer_scale_init_value: float = 1e-6,
|
||||
mlp_ratio: float = 4.0,
|
||||
kernel_size: int = 7,
|
||||
dilation: int = 1,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.dwconv = ops.Conv1d(
|
||||
dim,
|
||||
dim,
|
||||
kernel_size=kernel_size,
|
||||
padding=int(dilation * (kernel_size - 1) / 2),
|
||||
groups=dim,
|
||||
) # depthwise conv
|
||||
self.norm = LayerNorm(dim, eps=1e-6)
|
||||
self.pwconv1 = ops.Linear(
|
||||
dim, int(mlp_ratio * dim)
|
||||
) # pointwise/1x1 convs, implemented with linear layers
|
||||
self.act = nn.GELU()
|
||||
self.pwconv2 = ops.Linear(int(mlp_ratio * dim), dim)
|
||||
self.gamma = (
|
||||
nn.Parameter(torch.empty((dim)), requires_grad=False)
|
||||
if layer_scale_init_value > 0
|
||||
else None
|
||||
)
|
||||
self.drop_path = DropPath(
|
||||
drop_path) if drop_path > 0.0 else nn.Identity()
|
||||
|
||||
def forward(self, x, apply_residual: bool = True):
|
||||
input = x
|
||||
|
||||
x = self.dwconv(x)
|
||||
x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
|
||||
x = self.norm(x)
|
||||
x = self.pwconv1(x)
|
||||
x = self.act(x)
|
||||
x = self.pwconv2(x)
|
||||
|
||||
if self.gamma is not None:
|
||||
x = comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) * x
|
||||
|
||||
x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
|
||||
x = self.drop_path(x)
|
||||
|
||||
if apply_residual:
|
||||
x = input + x
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class ParallelConvNeXtBlock(nn.Module):
|
||||
def __init__(self, kernel_sizes: List[int], *args, **kwargs):
|
||||
super().__init__()
|
||||
self.blocks = nn.ModuleList(
|
||||
[
|
||||
ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs)
|
||||
for kernel_size in kernel_sizes
|
||||
]
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return torch.stack(
|
||||
[block(x, apply_residual=False) for block in self.blocks] + [x],
|
||||
dim=1,
|
||||
).sum(dim=1)
|
||||
|
||||
|
||||
class ConvNeXtEncoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
input_channels=3,
|
||||
depths=[3, 3, 9, 3],
|
||||
dims=[96, 192, 384, 768],
|
||||
drop_path_rate=0.0,
|
||||
layer_scale_init_value=1e-6,
|
||||
kernel_sizes: Tuple[int] = (7,),
|
||||
):
|
||||
super().__init__()
|
||||
assert len(depths) == len(dims)
|
||||
|
||||
self.channel_layers = nn.ModuleList()
|
||||
stem = nn.Sequential(
|
||||
ops.Conv1d(
|
||||
input_channels,
|
||||
dims[0],
|
||||
kernel_size=7,
|
||||
padding=3,
|
||||
padding_mode="replicate",
|
||||
),
|
||||
LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
|
||||
)
|
||||
self.channel_layers.append(stem)
|
||||
|
||||
for i in range(len(depths) - 1):
|
||||
mid_layer = nn.Sequential(
|
||||
LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
|
||||
ops.Conv1d(dims[i], dims[i + 1], kernel_size=1),
|
||||
)
|
||||
self.channel_layers.append(mid_layer)
|
||||
|
||||
block_fn = (
|
||||
partial(ConvNeXtBlock, kernel_size=kernel_sizes[0])
|
||||
if len(kernel_sizes) == 1
|
||||
else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes)
|
||||
)
|
||||
|
||||
self.stages = nn.ModuleList()
|
||||
drop_path_rates = [
|
||||
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
||||
]
|
||||
|
||||
cur = 0
|
||||
for i in range(len(depths)):
|
||||
stage = nn.Sequential(
|
||||
*[
|
||||
block_fn(
|
||||
dim=dims[i],
|
||||
drop_path=drop_path_rates[cur + j],
|
||||
layer_scale_init_value=layer_scale_init_value,
|
||||
)
|
||||
for j in range(depths[i])
|
||||
]
|
||||
)
|
||||
self.stages.append(stage)
|
||||
cur += depths[i]
|
||||
|
||||
self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
for channel_layer, stage in zip(self.channel_layers, self.stages):
|
||||
x = channel_layer(x)
|
||||
x = stage(x)
|
||||
|
||||
return self.norm(x)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
|
||||
return (kernel_size * dilation - dilation) // 2
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
|
||||
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
|
||||
super().__init__()
|
||||
|
||||
self.convs1 = nn.ModuleList(
|
||||
[
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=dilation[0],
|
||||
padding=get_padding(kernel_size, dilation[0]),
|
||||
)
|
||||
),
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=dilation[1],
|
||||
padding=get_padding(kernel_size, dilation[1]),
|
||||
)
|
||||
),
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=dilation[2],
|
||||
padding=get_padding(kernel_size, dilation[2]),
|
||||
)
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
self.convs2 = nn.ModuleList(
|
||||
[
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=1,
|
||||
padding=get_padding(kernel_size, 1),
|
||||
)
|
||||
),
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=1,
|
||||
padding=get_padding(kernel_size, 1),
|
||||
)
|
||||
),
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
channels,
|
||||
channels,
|
||||
kernel_size,
|
||||
1,
|
||||
dilation=1,
|
||||
padding=get_padding(kernel_size, 1),
|
||||
)
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
for c1, c2 in zip(self.convs1, self.convs2):
|
||||
xt = F.silu(x)
|
||||
xt = c1(xt)
|
||||
xt = F.silu(xt)
|
||||
xt = c2(xt)
|
||||
x = xt + x
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
for conv in self.convs1:
|
||||
remove_weight_norm(conv)
|
||||
for conv in self.convs2:
|
||||
remove_weight_norm(conv)
|
||||
|
||||
|
||||
class HiFiGANGenerator(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
hop_length: int = 512,
|
||||
upsample_rates: Tuple[int] = (8, 8, 2, 2, 2),
|
||||
upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2),
|
||||
resblock_kernel_sizes: Tuple[int] = (3, 7, 11),
|
||||
resblock_dilation_sizes: Tuple[Tuple[int]] = (
|
||||
(1, 3, 5), (1, 3, 5), (1, 3, 5)),
|
||||
num_mels: int = 128,
|
||||
upsample_initial_channel: int = 512,
|
||||
use_template: bool = True,
|
||||
pre_conv_kernel_size: int = 7,
|
||||
post_conv_kernel_size: int = 7,
|
||||
post_activation: Callable = partial(nn.SiLU, inplace=True),
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
assert (
|
||||
prod(upsample_rates) == hop_length
|
||||
), f"hop_length must be {prod(upsample_rates)}"
|
||||
|
||||
self.conv_pre = torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
num_mels,
|
||||
upsample_initial_channel,
|
||||
pre_conv_kernel_size,
|
||||
1,
|
||||
padding=get_padding(pre_conv_kernel_size),
|
||||
)
|
||||
)
|
||||
|
||||
self.num_upsamples = len(upsample_rates)
|
||||
self.num_kernels = len(resblock_kernel_sizes)
|
||||
|
||||
self.noise_convs = nn.ModuleList()
|
||||
self.use_template = use_template
|
||||
self.ups = nn.ModuleList()
|
||||
|
||||
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
||||
c_cur = upsample_initial_channel // (2 ** (i + 1))
|
||||
self.ups.append(
|
||||
torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.ConvTranspose1d(
|
||||
upsample_initial_channel // (2**i),
|
||||
upsample_initial_channel // (2 ** (i + 1)),
|
||||
k,
|
||||
u,
|
||||
padding=(k - u) // 2,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
if not use_template:
|
||||
continue
|
||||
|
||||
if i + 1 < len(upsample_rates):
|
||||
stride_f0 = np.prod(upsample_rates[i + 1:])
|
||||
self.noise_convs.append(
|
||||
ops.Conv1d(
|
||||
1,
|
||||
c_cur,
|
||||
kernel_size=stride_f0 * 2,
|
||||
stride=stride_f0,
|
||||
padding=stride_f0 // 2,
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.noise_convs.append(ops.Conv1d(1, c_cur, kernel_size=1))
|
||||
|
||||
self.resblocks = nn.ModuleList()
|
||||
for i in range(len(self.ups)):
|
||||
ch = upsample_initial_channel // (2 ** (i + 1))
|
||||
for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
|
||||
self.resblocks.append(ResBlock1(ch, k, d))
|
||||
|
||||
self.activation_post = post_activation()
|
||||
self.conv_post = torch.nn.utils.parametrizations.weight_norm(
|
||||
ops.Conv1d(
|
||||
ch,
|
||||
1,
|
||||
post_conv_kernel_size,
|
||||
1,
|
||||
padding=get_padding(post_conv_kernel_size),
|
||||
)
|
||||
)
|
||||
|
||||
def forward(self, x, template=None):
|
||||
x = self.conv_pre(x)
|
||||
|
||||
for i in range(self.num_upsamples):
|
||||
x = F.silu(x, inplace=True)
|
||||
x = self.ups[i](x)
|
||||
|
||||
if self.use_template:
|
||||
x = x + self.noise_convs[i](template)
|
||||
|
||||
xs = None
|
||||
|
||||
for j in range(self.num_kernels):
|
||||
if xs is None:
|
||||
xs = self.resblocks[i * self.num_kernels + j](x)
|
||||
else:
|
||||
xs += self.resblocks[i * self.num_kernels + j](x)
|
||||
|
||||
x = xs / self.num_kernels
|
||||
|
||||
x = self.activation_post(x)
|
||||
x = self.conv_post(x)
|
||||
x = torch.tanh(x)
|
||||
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
for up in self.ups:
|
||||
remove_weight_norm(up)
|
||||
for block in self.resblocks:
|
||||
block.remove_weight_norm()
|
||||
remove_weight_norm(self.conv_pre)
|
||||
remove_weight_norm(self.conv_post)
|
||||
|
||||
|
||||
class ADaMoSHiFiGANV1(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
input_channels: int = 128,
|
||||
depths: List[int] = [3, 3, 9, 3],
|
||||
dims: List[int] = [128, 256, 384, 512],
|
||||
drop_path_rate: float = 0.0,
|
||||
kernel_sizes: Tuple[int] = (7,),
|
||||
upsample_rates: Tuple[int] = (4, 4, 2, 2, 2, 2, 2),
|
||||
upsample_kernel_sizes: Tuple[int] = (8, 8, 4, 4, 4, 4, 4),
|
||||
resblock_kernel_sizes: Tuple[int] = (3, 7, 11, 13),
|
||||
resblock_dilation_sizes: Tuple[Tuple[int]] = (
|
||||
(1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)),
|
||||
num_mels: int = 512,
|
||||
upsample_initial_channel: int = 1024,
|
||||
use_template: bool = False,
|
||||
pre_conv_kernel_size: int = 13,
|
||||
post_conv_kernel_size: int = 13,
|
||||
sampling_rate: int = 44100,
|
||||
n_fft: int = 2048,
|
||||
win_length: int = 2048,
|
||||
hop_length: int = 512,
|
||||
f_min: int = 40,
|
||||
f_max: int = 16000,
|
||||
n_mels: int = 128,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.backbone = ConvNeXtEncoder(
|
||||
input_channels=input_channels,
|
||||
depths=depths,
|
||||
dims=dims,
|
||||
drop_path_rate=drop_path_rate,
|
||||
kernel_sizes=kernel_sizes,
|
||||
)
|
||||
|
||||
self.head = HiFiGANGenerator(
|
||||
hop_length=hop_length,
|
||||
upsample_rates=upsample_rates,
|
||||
upsample_kernel_sizes=upsample_kernel_sizes,
|
||||
resblock_kernel_sizes=resblock_kernel_sizes,
|
||||
resblock_dilation_sizes=resblock_dilation_sizes,
|
||||
num_mels=num_mels,
|
||||
upsample_initial_channel=upsample_initial_channel,
|
||||
use_template=use_template,
|
||||
pre_conv_kernel_size=pre_conv_kernel_size,
|
||||
post_conv_kernel_size=post_conv_kernel_size,
|
||||
)
|
||||
self.sampling_rate = sampling_rate
|
||||
self.mel_transform = LogMelSpectrogram(
|
||||
sample_rate=sampling_rate,
|
||||
n_fft=n_fft,
|
||||
win_length=win_length,
|
||||
hop_length=hop_length,
|
||||
f_min=f_min,
|
||||
f_max=f_max,
|
||||
n_mels=n_mels,
|
||||
)
|
||||
self.eval()
|
||||
|
||||
@torch.no_grad()
|
||||
def decode(self, mel):
|
||||
y = self.backbone(mel)
|
||||
y = self.head(y)
|
||||
return y
|
||||
|
||||
@torch.no_grad()
|
||||
def encode(self, x):
|
||||
return self.mel_transform(x)
|
||||
|
||||
def forward(self, mel):
|
||||
y = self.backbone(mel)
|
||||
y = self.head(y)
|
||||
return y
|
||||
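Not part of the diff: a mel-to-waveform shape check of the vocoder with its default arguments and random weights (so the output values are meaningless, only the shapes are of interest); it assumes torchaudio is installed for the mel transform.

# Illustrative sketch, not part of the diff: ADaMoSHiFiGANV1.decode shape check.
import torch
from comfy.ldm.ace.vae.music_vocoder import ADaMoSHiFiGANV1

vocoder = ADaMoSHiFiGANV1().eval()
mel = torch.randn(1, 128, 86)             # (batch, mel bins, frames)
with torch.no_grad():
    wav = vocoder.decode(mel)
print(wav.shape)                          # torch.Size([1, 1, 44032]) = 86 frames * hop 512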
@@ -75,16 +75,10 @@ class SnakeBeta(nn.Module):
        return x

def WNConv1d(*args, **kwargs):
    try:
        return torch.nn.utils.parametrizations.weight_norm(ops.Conv1d(*args, **kwargs))
    except:
        return torch.nn.utils.weight_norm(ops.Conv1d(*args, **kwargs)) #support pytorch 2.1 and older

def WNConvTranspose1d(*args, **kwargs):
    try:
        return torch.nn.utils.parametrizations.weight_norm(ops.ConvTranspose1d(*args, **kwargs))
    except:
        return torch.nn.utils.weight_norm(ops.ConvTranspose1d(*args, **kwargs)) #support pytorch 2.1 and older

def get_activation(activation: Literal["elu", "snake", "none"], antialias=False, channels=None) -> nn.Module:
    if activation == "elu":

@@ -39,6 +39,7 @@ import comfy.ldm.wan.model
import comfy.ldm.hunyuan3d.model
import comfy.ldm.hidream.model
import comfy.ldm.chroma.model
import comfy.ldm.ace.model

import comfy.model_management
import comfy.patcher_extension
@@ -1111,7 +1112,7 @@ class HiDream(BaseModel):
        return out

class Chroma(Flux):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma)

    def extra_conds(self, **kwargs):
@@ -1121,3 +1122,22 @@ class Chroma(Flux):
        if guidance is not None:
            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        return out

class ACEStep(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.model.ACEStepTransformer2DModel)

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
        noise = kwargs.get("noise", None)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

        conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
        if cross_attn is not None:
            out['lyric_token_idx'] = comfy.conds.CONDRegular(conditioning_lyrics)
        out['speaker_embeds'] = comfy.conds.CONDRegular(torch.zeros(noise.shape[0], 512, device=noise.device, dtype=noise.dtype))
        out['lyrics_strength'] = comfy.conds.CONDConstant(kwargs.get("lyrics_strength", 1.0))
        return out

@@ -222,10 +222,39 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
    if '{}adaln_single.emb.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: #Lightricks ltxv
        dit_config = {}
        dit_config["image_model"] = "ltxv"
        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
        shape = state_dict['{}transformer_blocks.0.attn2.to_k.weight'.format(key_prefix)].shape
        dit_config["attention_head_dim"] = shape[0] // 32
        dit_config["cross_attention_dim"] = shape[1]
        if metadata is not None and "config" in metadata:
            dit_config.update(json.loads(metadata["config"]).get("transformer", {}))
        return dit_config

    if '{}genre_embedder.weight'.format(key_prefix) in state_dict_keys: #ACE-Step model
        dit_config = {}
        dit_config["audio_model"] = "ace"
        dit_config["attention_head_dim"] = 128
        dit_config["in_channels"] = 8
        dit_config["inner_dim"] = 2560
        dit_config["max_height"] = 16
        dit_config["max_position"] = 32768
        dit_config["max_width"] = 32768
        dit_config["mlp_ratio"] = 2.5
        dit_config["num_attention_heads"] = 20
        dit_config["num_layers"] = 24
        dit_config["out_channels"] = 8
        dit_config["patch_size"] = [16, 1]
        dit_config["rope_theta"] = 1000000.0
        dit_config["speaker_embedding_dim"] = 512
        dit_config["text_embedding_dim"] = 768

        dit_config["ssl_encoder_depths"] = [8, 8]
        dit_config["ssl_latent_dims"] = [1024, 768]
        dit_config["ssl_names"] = ["mert", "m-hubert"]
        dit_config["lyric_encoder_vocab_size"] = 6693
        dit_config["lyric_hidden_size"] = 1024
        return dit_config

    if '{}t_block.1.weight'.format(key_prefix) in state_dict_keys: # PixArt
        patch_size = 2
        dit_config = {}

@@ -308,10 +308,10 @@ def fp8_linear(self, input):
        if scale_input is None:
            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
            input = torch.clamp(input, min=-448, max=448, out=input)
            input = input.reshape(-1, input_shape[2]).to(dtype)
            input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
        else:
            scale_input = scale_input.to(input.device)
            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype)
            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous()

        if bias is not None:
            o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)

comfy/sd.py (52 lines changed)
@@ -15,6 +15,7 @@ import comfy.ldm.lightricks.vae.causal_video_autoencoder
import comfy.ldm.cosmos.vae
import comfy.ldm.wan.vae
import comfy.ldm.hunyuan3d.vae
import comfy.ldm.ace.vae.music_dcae_pipeline
import yaml
import math

@@ -42,6 +43,7 @@ import comfy.text_encoders.cosmos
import comfy.text_encoders.lumina2
import comfy.text_encoders.wan
import comfy.text_encoders.hidream
import comfy.text_encoders.ace

import comfy.model_patcher
import comfy.lora
@@ -280,6 +282,7 @@ class VAE:

        self.downscale_index_formula = None
        self.upscale_index_formula = None
        self.extra_1d_channel = None

        if config is None:
            if "decoder.mid.block_1.mix_factor" in sd:
@@ -437,6 +440,20 @@ class VAE:
                ddconfig = {"embed_dim": 64, "num_freqs": 8, "include_pi": False, "heads": 16, "width": 1024, "num_decoder_layers": 16, "qkv_bias": False, "qk_norm": True, "geo_decoder_mlp_expand_ratio": mlp_expand, "geo_decoder_downsample_ratio": downsample_ratio, "geo_decoder_ln_post": ln_post}
                self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE(**ddconfig)
                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
            elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
                self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100)
                self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
                self.latent_channels = 8
                self.output_channels = 2
                self.upscale_ratio = 4096
                self.downscale_ratio = 4096
                self.latent_dim = 2
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
                self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
                self.disable_offload = True
                self.extra_1d_channel = 16
            else:
                logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                self.first_stage_model = None
@@ -495,7 +512,13 @@ class VAE:
|
||||
return output
|
||||
|
||||
def decode_tiled_1d(self, samples, tile_x=128, overlap=32):
|
||||
if samples.ndim == 3:
|
||||
decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
|
||||
else:
|
||||
og_shape = samples.shape
|
||||
samples = samples.reshape((og_shape[0], og_shape[1] * og_shape[2], -1))
|
||||
decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).float()
|
||||
|
||||
return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))
|
||||
|
||||
def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
|
||||
@@ -515,9 +538,24 @@ class VAE:
|
||||
samples /= 3.0
|
||||
return samples
|
||||
|
||||
def encode_tiled_1d(self, samples, tile_x=128 * 2048, overlap=32 * 2048):
|
||||
def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048):
|
||||
if self.latent_dim == 1:
|
||||
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
|
||||
return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=(1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device)
|
||||
out_channels = self.latent_channels
|
||||
upscale_amount = 1 / self.downscale_ratio
|
||||
else:
|
||||
extra_channel_size = self.extra_1d_channel
|
||||
out_channels = self.latent_channels * extra_channel_size
|
||||
tile_x = tile_x // extra_channel_size
|
||||
overlap = overlap // extra_channel_size
|
||||
upscale_amount = 1 / self.downscale_ratio
|
||||
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).float()
|
||||
|
||||
out = comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=self.output_device)
|
||||
if self.latent_dim == 1:
|
||||
return out
|
||||
else:
|
||||
return out.reshape(samples.shape[0], self.latent_channels, extra_channel_size, -1)
|
||||
|
||||
def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
|
||||
encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
|
||||
@@ -542,7 +580,7 @@ class VAE:
|
||||
except model_management.OOM_EXCEPTION:
|
||||
logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
|
||||
dims = samples_in.ndim - 2
|
||||
if dims == 1:
|
||||
if dims == 1 or self.extra_1d_channel is not None:
|
||||
pixel_samples = self.decode_tiled_1d(samples_in)
|
||||
elif dims == 2:
|
||||
pixel_samples = self.decode_tiled_(samples_in)
|
||||
@@ -609,7 +647,7 @@ class VAE:
|
||||
tile = 256
|
||||
overlap = tile // 4
|
||||
samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
|
||||
elif self.latent_dim == 1:
|
||||
elif self.latent_dim == 1 or self.extra_1d_channel is not None:
|
||||
samples = self.encode_tiled_1d(pixel_samples)
|
||||
else:
|
||||
samples = self.encode_tiled_(pixel_samples)
|
||||
@@ -715,6 +753,7 @@ class CLIPType(Enum):
    WAN = 13
    HIDREAM = 14
    CHROMA = 15
    ACE = 16


def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -840,6 +879,11 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
            clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
        elif te_model == TEModel.T5_BASE:
            if clip_type == CLIPType.ACE or "spiece_model" in clip_data[0]:
                clip_target.clip = comfy.text_encoders.ace.AceT5Model
                clip_target.tokenizer = comfy.text_encoders.ace.AceT5Tokenizer
                tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
            else:
                clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
                clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
        elif te_model == TEModel.GEMMA_2_2B:

@@ -17,6 +17,7 @@ import comfy.text_encoders.hunyuan_video
import comfy.text_encoders.cosmos
import comfy.text_encoders.lumina2
import comfy.text_encoders.wan
import comfy.text_encoders.ace

from . import supported_models_base
from . import latent_formats
@@ -785,6 +786,10 @@ class LTXV(supported_models_base.BASE):
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def __init__(self, unet_config):
        super().__init__(unet_config)
        self.memory_usage_factor = (unet_config.get("cross_attention_dim", 2048) / 2048) * 5.5

    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.LTXV(self, device=device)
        return out
@@ -1096,6 +1101,34 @@ class Chroma(supported_models_base.BASE):
        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect))

models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma]
class ACEStep(supported_models_base.BASE):
    unet_config = {
        "audio_model": "ace",
    }

    unet_extra_config = {
    }

    sampling_settings = {
        "shift": 3.0,
    }

    latent_format = comfy.latent_formats.ACEAudio

    memory_usage_factor = 0.5

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.ACEStep(self, device=device)
        return out

    def clip_target(self, state_dict={}):
        return supported_models_base.ClipTarget(comfy.text_encoders.ace.AceT5Tokenizer, comfy.text_encoders.ace.AceT5Model)

models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep]

models += [SVD_img2vid]

comfy/text_encoders/ace.py (new file, 153 lines)
@@ -0,0 +1,153 @@
|
||||
from comfy import sd1_clip
|
||||
from .spiece_tokenizer import SPieceTokenizer
|
||||
import comfy.text_encoders.t5
|
||||
import os
|
||||
import re
|
||||
import torch
|
||||
import logging
|
||||
|
||||
from tokenizers import Tokenizer
|
||||
from .ace_text_cleaners import multilingual_cleaners, japanese_to_romaji
|
||||
|
||||
SUPPORT_LANGUAGES = {
|
||||
"en": 259, "de": 260, "fr": 262, "es": 284, "it": 285,
|
||||
"pt": 286, "pl": 294, "tr": 295, "ru": 267, "cs": 293,
|
||||
"nl": 297, "ar": 5022, "zh": 5023, "ja": 5412, "hu": 5753,
|
||||
"ko": 6152, "hi": 6680
|
||||
}
|
||||
|
||||
structure_pattern = re.compile(r"\[.*?\]")
|
||||
|
||||
DEFAULT_VOCAB_FILE = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")
|
||||
class VoiceBpeTokenizer:
    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt, lang):
        txt = multilingual_cleaners(txt, lang)
        return txt

    def encode(self, txt, lang='en'):
        # lang = lang.split("-")[0]  # remove the region
        # self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def get_lang(self, line):
        if line.startswith("[") and line[3:4] == ']':
            lang = line[1:3].lower()
            if lang in SUPPORT_LANGUAGES:
                return lang, line[4:]
        return "en", line

    def __call__(self, string):
        lines = string.split("\n")
        lyric_token_idx = [261]
        for line in lines:
            line = line.strip()
            if not line:
                lyric_token_idx += [2]
                continue

            lang, line = self.get_lang(line)

            if lang not in SUPPORT_LANGUAGES:
                lang = "en"
            if "zh" in lang:
                lang = "zh"
            if "spa" in lang:
                lang = "es"

            try:
                line_out = japanese_to_romaji(line)
                if line_out != line:
                    lang = "ja"
                    line = line_out
            except:
                pass

            try:
                if structure_pattern.match(line):
                    token_idx = self.encode(line, "en")
                else:
                    token_idx = self.encode(line, lang)
                lyric_token_idx = lyric_token_idx + token_idx + [2]
            except Exception as e:
                logging.warning("tokenize error {} for line {} major_language {}".format(e, line, lang))
        return {"input_ids": lyric_token_idx}

    @staticmethod
    def from_pretrained(path, **kwargs):
        return VoiceBpeTokenizer(path, **kwargs)

    def get_vocab(self):
        return {}
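The lyrics tokenizer above is line-oriented: a line may start with a `[xx]` language tag, structural markers like `[verse]` are encoded as English, and token id 2 is appended as a line separator after the start id 261. A minimal, hypothetical usage sketch (not part of the commit; it assumes the bundled `ace_lyrics_tokenizer/vocab.json` is present on disk):

```python
from comfy.text_encoders.ace import VoiceBpeTokenizer

tok = VoiceBpeTokenizer()  # loads DEFAULT_VOCAB_FILE
ids = tok("[verse]\n[en]hello world\n[ja]こんにちは")["input_ids"]
# ids starts with 261, and every successfully encoded line is followed by the separator id 2.
print(ids[:5], len(ids))
```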
class UMT5BaseModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "umt5_config_base.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=False, model_options=model_options)

class UMT5BaseTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer = tokenizer_data.get("spiece_model", None)
        super().__init__(tokenizer, pad_with_end=False, embedding_size=768, embedding_key='umt5base', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=0, tokenizer_data=tokenizer_data)

    def state_dict(self):
        return {"spiece_model": self.tokenizer.serialize_model()}

class LyricsTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")
        super().__init__(tokenizer, pad_with_end=False, embedding_size=1024, embedding_key='lyrics', tokenizer_class=VoiceBpeTokenizer, has_start_token=True, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=2, has_end_token=False, tokenizer_data=tokenizer_data)

class AceT5Tokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        self.voicebpe = LyricsTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        self.umt5base = UMT5BaseTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)

    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
        out = {}
        out["lyrics"] = self.voicebpe.tokenize_with_weights(kwargs.get("lyrics", ""), return_word_ids, **kwargs)
        out["umt5base"] = self.umt5base.tokenize_with_weights(text, return_word_ids, **kwargs)
        return out

    def untokenize(self, token_weight_pair):
        return self.umt5base.untokenize(token_weight_pair)

    def state_dict(self):
        return self.umt5base.state_dict()
class AceT5Model(torch.nn.Module):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__()
        self.umt5base = UMT5BaseModel(device=device, dtype=dtype, model_options=model_options)
        self.dtypes = set()
        if dtype is not None:
            self.dtypes.add(dtype)

    def set_clip_options(self, options):
        self.umt5base.set_clip_options(options)

    def reset_clip_options(self):
        self.umt5base.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        token_weight_pairs_umt5base = token_weight_pairs["umt5base"]
        token_weight_pairs_lyrics = token_weight_pairs["lyrics"]

        t5_out, t5_pooled = self.umt5base.encode_token_weights(token_weight_pairs_umt5base)

        lyrics_embeds = torch.tensor(list(map(lambda a: a[0], token_weight_pairs_lyrics[0]))).unsqueeze(0)
        return t5_out, None, {"conditioning_lyrics": lyrics_embeds}

    def load_sd(self, sd):
        return self.umt5base.load_sd(sd)
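The combined tokenizer/model pair keeps the two streams separate end to end: the UMT5 branch is actually encoded, while the lyric token ids are passed through as extra conditioning. A rough, hypothetical flow sketch (the tensors are placeholders, not real model outputs; `ace_tokenizer` and `ace_model` stand for instances of `AceT5Tokenizer` and `AceT5Model` with loaded weights):

```python
tokens = ace_tokenizer.tokenize_with_weights("a calm piano ballad", lyrics="[en]hello world")
# tokens == {"umt5base": [...weighted UMT5 token ids...], "lyrics": [...lyric token ids...]}

cond, pooled, extra = ace_model.encode_token_weights(tokens)
# cond   -> UMT5-base hidden states used as the main conditioning
# pooled -> None (this encoder has no pooled output)
# extra["conditioning_lyrics"] -> tensor of lyric token ids, shape [1, num_lyric_tokens]
```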
15535  comfy/text_encoders/ace_lyrics_tokenizer/vocab.json  Normal file
File diff suppressed because it is too large
395  comfy/text_encoders/ace_text_cleaners.py  Normal file
@@ -0,0 +1,395 @@
# basic text cleaners for the ACE step model
# I didn't copy the ones from the reference code because I didn't want to deal with the dependencies
# TODO: more languages than english?

import re

def japanese_to_romaji(japanese_text):
    """
    Convert Japanese hiragana and katakana to romaji (Latin alphabet representation).

    Args:
        japanese_text (str): Text containing hiragana and/or katakana characters

    Returns:
        str: The romaji (Latin alphabet) equivalent
    """
    # Dictionary mapping kana characters to their romaji equivalents
    kana_map = {
        # Katakana characters
        'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o',
        'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko',
        'サ': 'sa', 'シ': 'shi', 'ス': 'su', 'セ': 'se', 'ソ': 'so',
        'タ': 'ta', 'チ': 'chi', 'ツ': 'tsu', 'テ': 'te', 'ト': 'to',
        'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
        'ハ': 'ha', 'ヒ': 'hi', 'フ': 'fu', 'ヘ': 'he', 'ホ': 'ho',
        'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo',
        'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo',
        'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro',
        'ワ': 'wa', 'ヲ': 'wo', 'ン': 'n',

        # Katakana voiced consonants
        'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu', 'ゲ': 'ge', 'ゴ': 'go',
        'ザ': 'za', 'ジ': 'ji', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo',
        'ダ': 'da', 'ヂ': 'ji', 'ヅ': 'zu', 'デ': 'de', 'ド': 'do',
        'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be', 'ボ': 'bo',
        'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po',

        # Katakana combinations
        'キャ': 'kya', 'キュ': 'kyu', 'キョ': 'kyo',
        'シャ': 'sha', 'シュ': 'shu', 'ショ': 'sho',
        'チャ': 'cha', 'チュ': 'chu', 'チョ': 'cho',
        'ニャ': 'nya', 'ニュ': 'nyu', 'ニョ': 'nyo',
        'ヒャ': 'hya', 'ヒュ': 'hyu', 'ヒョ': 'hyo',
        'ミャ': 'mya', 'ミュ': 'myu', 'ミョ': 'myo',
        'リャ': 'rya', 'リュ': 'ryu', 'リョ': 'ryo',
        'ギャ': 'gya', 'ギュ': 'gyu', 'ギョ': 'gyo',
        'ジャ': 'ja', 'ジュ': 'ju', 'ジョ': 'jo',
        'ビャ': 'bya', 'ビュ': 'byu', 'ビョ': 'byo',
        'ピャ': 'pya', 'ピュ': 'pyu', 'ピョ': 'pyo',

        # Katakana small characters and special cases
        'ッ': '',  # Small tsu (doubles the following consonant)
        'ャ': 'ya', 'ュ': 'yu', 'ョ': 'yo',

        # Katakana extras
        'ヴ': 'vu', 'ファ': 'fa', 'フィ': 'fi', 'フェ': 'fe', 'フォ': 'fo',
        'ウィ': 'wi', 'ウェ': 'we', 'ウォ': 'wo',

        # Hiragana characters
        'あ': 'a', 'い': 'i', 'う': 'u', 'え': 'e', 'お': 'o',
        'か': 'ka', 'き': 'ki', 'く': 'ku', 'け': 'ke', 'こ': 'ko',
        'さ': 'sa', 'し': 'shi', 'す': 'su', 'せ': 'se', 'そ': 'so',
        'た': 'ta', 'ち': 'chi', 'つ': 'tsu', 'て': 'te', 'と': 'to',
        'な': 'na', 'に': 'ni', 'ぬ': 'nu', 'ね': 'ne', 'の': 'no',
        'は': 'ha', 'ひ': 'hi', 'ふ': 'fu', 'へ': 'he', 'ほ': 'ho',
        'ま': 'ma', 'み': 'mi', 'む': 'mu', 'め': 'me', 'も': 'mo',
        'や': 'ya', 'ゆ': 'yu', 'よ': 'yo',
        'ら': 'ra', 'り': 'ri', 'る': 'ru', 'れ': 're', 'ろ': 'ro',
        'わ': 'wa', 'を': 'wo', 'ん': 'n',

        # Hiragana voiced consonants
        'が': 'ga', 'ぎ': 'gi', 'ぐ': 'gu', 'げ': 'ge', 'ご': 'go',
        'ざ': 'za', 'じ': 'ji', 'ず': 'zu', 'ぜ': 'ze', 'ぞ': 'zo',
        'だ': 'da', 'ぢ': 'ji', 'づ': 'zu', 'で': 'de', 'ど': 'do',
        'ば': 'ba', 'び': 'bi', 'ぶ': 'bu', 'べ': 'be', 'ぼ': 'bo',
        'ぱ': 'pa', 'ぴ': 'pi', 'ぷ': 'pu', 'ぺ': 'pe', 'ぽ': 'po',

        # Hiragana combinations
        'きゃ': 'kya', 'きゅ': 'kyu', 'きょ': 'kyo',
        'しゃ': 'sha', 'しゅ': 'shu', 'しょ': 'sho',
        'ちゃ': 'cha', 'ちゅ': 'chu', 'ちょ': 'cho',
        'にゃ': 'nya', 'にゅ': 'nyu', 'にょ': 'nyo',
        'ひゃ': 'hya', 'ひゅ': 'hyu', 'ひょ': 'hyo',
        'みゃ': 'mya', 'みゅ': 'myu', 'みょ': 'myo',
        'りゃ': 'rya', 'りゅ': 'ryu', 'りょ': 'ryo',
        'ぎゃ': 'gya', 'ぎゅ': 'gyu', 'ぎょ': 'gyo',
        'じゃ': 'ja', 'じゅ': 'ju', 'じょ': 'jo',
        'びゃ': 'bya', 'びゅ': 'byu', 'びょ': 'byo',
        'ぴゃ': 'pya', 'ぴゅ': 'pyu', 'ぴょ': 'pyo',

        # Hiragana small characters and special cases
        'っ': '',  # Small tsu (doubles the following consonant)
        'ゃ': 'ya', 'ゅ': 'yu', 'ょ': 'yo',

        # Common punctuation and spaces
        '　': ' ',  # Japanese space
        '、': ', ', '。': '. ',
    }

    result = []
    i = 0

    while i < len(japanese_text):
        # Check for small tsu (doubling the following consonant)
        if i < len(japanese_text) - 1 and (japanese_text[i] == 'っ' or japanese_text[i] == 'ッ'):
            if i < len(japanese_text) - 1 and japanese_text[i+1] in kana_map:
                next_romaji = kana_map[japanese_text[i+1]]
                if next_romaji and next_romaji[0] not in 'aiueon':
                    result.append(next_romaji[0])  # Double the consonant
                i += 1
                continue

        # Check for combinations with small ya, yu, yo
        if i < len(japanese_text) - 1 and japanese_text[i+1] in ('ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ'):
            combo = japanese_text[i:i+2]
            if combo in kana_map:
                result.append(kana_map[combo])
                i += 2
                continue

        # Regular character
        if japanese_text[i] in kana_map:
            result.append(kana_map[japanese_text[i]])
        else:
            # If it's not in our map, keep it as is (might be kanji, romaji, etc.)
            result.append(japanese_text[i])

        i += 1

    return ''.join(result)
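A quick check of the conversion rules above (sokuon doubling and small ゃ/ゅ/ょ combinations), as a hypothetical doctest-style example rather than anything shipped in the commit:

```python
from comfy.text_encoders.ace_text_cleaners import japanese_to_romaji

print(japanese_to_romaji("こんにちは"))   # -> "konnichiha" (kana-by-kana, no phonetic "wa")
print(japanese_to_romaji("きょう"))       # -> "kyou" (small yo handled as a combination)
print(japanese_to_romaji("ちょっと"))     # -> "chotto" (small tsu doubles the "t")
print(japanese_to_romaji("漢字テスト"))   # kanji pass through unchanged: "漢字tesuto"
```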
def number_to_text(num, ordinal=False):
    """
    Convert a number (int or float) to its text representation.

    Args:
        num: The number to convert

    Returns:
        str: Text representation of the number
    """

    if not isinstance(num, (int, float)):
        return "Input must be a number"

    # Handle special case of zero
    if num == 0:
        return "zero"

    # Handle negative numbers
    negative = num < 0
    num = abs(num)

    # Handle floats
    if isinstance(num, float):
        # Split into integer and decimal parts
        int_part = int(num)

        # Convert both parts
        int_text = _int_to_text(int_part)

        # Handle decimal part (convert to string and remove '0.')
        decimal_str = str(num).split('.')[1]
        decimal_text = " point " + " ".join(_digit_to_text(int(digit)) for digit in decimal_str)

        result = int_text + decimal_text
    else:
        # Handle integers
        result = _int_to_text(num)

    # Add 'negative' prefix for negative numbers
    if negative:
        result = "negative " + result

    return result


def _int_to_text(num):
    """Helper function to convert an integer to text"""

    ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
            "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
            "seventeen", "eighteen", "nineteen"]

    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    if num < 20:
        return ones[num]

    if num < 100:
        return tens[num // 10] + (" " + ones[num % 10] if num % 10 != 0 else "")

    if num < 1000:
        return ones[num // 100] + " hundred" + (" " + _int_to_text(num % 100) if num % 100 != 0 else "")

    if num < 1000000:
        return _int_to_text(num // 1000) + " thousand" + (" " + _int_to_text(num % 1000) if num % 1000 != 0 else "")

    if num < 1000000000:
        return _int_to_text(num // 1000000) + " million" + (" " + _int_to_text(num % 1000000) if num % 1000000 != 0 else "")

    return _int_to_text(num // 1000000000) + " billion" + (" " + _int_to_text(num % 1000000000) if num % 1000000000 != 0 else "")


def _digit_to_text(digit):
    """Convert a single digit to text"""
    digits = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    return digits[digit]
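Worked examples of the spell-out behaviour (hypothetical, only to illustrate the recursion in `_int_to_text` and the digit-by-digit decimal handling):

```python
from comfy.text_encoders.ace_text_cleaners import number_to_text

print(number_to_text(42))       # "forty two"
print(number_to_text(1305))     # "one thousand three hundred five"
print(number_to_text(-7))       # "negative seven"
print(number_to_text(3.14))     # "three point one four"
```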
_whitespace_re = re.compile(r"\s+")


# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = {
    "en": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        ]
    ],
}


def expand_abbreviations_multilingual(text, lang="en"):
    for regex, replacement in _abbreviations[lang]:
        text = re.sub(regex, replacement, text)
    return text


_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        ]
    ],
}


def expand_symbols_multilingual(text, lang="en"):
    for regex, replacement in _symbols_multilingual[lang]:
        text = re.sub(regex, replacement, text)
        text = text.replace("  ", " ")  # Ensure there are no double spaces
    return text.strip()


_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
_dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b")
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")


def _remove_commas(m):
    text = m.group(0)
    if "," in text:
        text = text.replace(",", "")
    return text


def _remove_dots(m):
    text = m.group(0)
    if "." in text:
        text = text.replace(".", "")
    return text


def _expand_decimal_point(m, lang="en"):
    amount = m.group(1).replace(",", ".")
    return number_to_text(float(amount))


def _expand_currency(m, lang="en", currency="USD"):
    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
    full_amount = number_to_text(amount)

    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    if amount.is_integer():
        last_and = full_amount.rfind(and_equivalents[lang])
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount


def _expand_ordinal(m, lang="en"):
    return number_to_text(int(m.group(1)), ordinal=True)


def _expand_number(m, lang="en"):
    return number_to_text(int(m.group(0)))


def expand_numbers_multilingual(text, lang="en"):
    if lang in ["en", "ru"]:
        text = re.sub(_comma_number_re, _remove_commas, text)
    else:
        text = re.sub(_dot_number_re, _remove_dots, text)
    try:
        text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
        text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
        text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
    except:
        pass

    text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
    text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def multilingual_cleaners(text, lang):
    text = text.replace('"', "")
    if lang == "tr":
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")
    text = lowercase(text)
    try:
        text = expand_numbers_multilingual(text, lang)
    except:
        pass
    try:
        text = expand_abbreviations_multilingual(text, lang)
    except:
        pass
    try:
        text = expand_symbols_multilingual(text, lang=lang)
    except:
        pass
    text = collapse_whitespace(text)
    return text


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
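End to end, the cleaner lowercases the text, spells out numbers, abbreviations and symbols, and squeezes whitespace before the text reaches the lyric tokenizer. A hypothetical before/after pair (not from the commit):

```python
from comfy.text_encoders.ace_text_cleaners import multilingual_cleaners

print(multilingual_cleaners("Mr. Smith & Dr. Jones met at 3 o'clock", "en"))
# -> "mister smith and doctor jones met at three o'clock"
```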
22  comfy/text_encoders/umt5_config_base.json  Normal file
@@ -0,0 +1,22 @@
{
    "d_ff": 2048,
    "d_kv": 64,
    "d_model": 768,
    "decoder_start_token_id": 0,
    "dropout_rate": 0.1,
    "eos_token_id": 1,
    "dense_act_fn": "gelu_pytorch_tanh",
    "initializer_factor": 1.0,
    "is_encoder_decoder": true,
    "is_gated_act": true,
    "layer_norm_epsilon": 1e-06,
    "model_type": "umt5",
    "num_decoder_layers": 12,
    "num_heads": 12,
    "num_layers": 12,
    "output_past": true,
    "pad_token_id": 0,
    "relative_attention_num_buckets": 32,
    "tie_word_embeddings": false,
    "vocab_size": 256384
}
@@ -28,6 +28,9 @@ import logging
import itertools
from torch.nn.functional import interpolate
from einops import rearrange
from comfy.cli_args import args

MMAP_TORCH_FILES = args.mmap_torch_files

ALWAYS_SAFE_LOAD = False
if hasattr(torch.serialization, "add_safe_globals"):  # TODO: this was added in pytorch 2.4, the unsafe path should be removed once earlier versions are deprecated
@@ -67,8 +70,12 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
            raise ValueError("{}\n\nFile path: {}\n\nThe safetensors file is corrupt/incomplete. Check the file size and make sure you have copied/downloaded it correctly.".format(message, ckpt))
        raise e
    else:
        torch_args = {}
        if MMAP_TORCH_FILES:
            torch_args["mmap"] = True

        if safe_load or ALWAYS_SAFE_LOAD:
            pl_sd = torch.load(ckpt, map_location=device, weights_only=True)
            pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args)
        else:
            pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle)
    if "global_step" in pl_sd:
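The new flag only changes how legacy pickle checkpoints are read into memory, and it is opt-in from the CLI. A hedged sketch of the resulting call path (paths and file names are made up):

```python
# Launch ComfyUI with memory-mapped loading for .ckpt/.pt files:
#   python main.py --mmap-torch-files
import comfy.utils

# With the flag set, safe loads go through torch.load(..., weights_only=True, mmap=True),
# so tensor storage is paged in lazily from disk instead of being read up front.
sd = comfy.utils.load_torch_file("models/checkpoints/example_model.ckpt", safe_load=True)
```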
@@ -1,3 +1,4 @@
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
@@ -314,7 +315,7 @@ def upload_file_to_comfyapi(
|
||||
file_bytes_io: BytesIO,
|
||||
filename: str,
|
||||
upload_mime_type: str,
|
||||
auth_token: Optional[str] = None,
|
||||
auth_kwargs: Optional[dict[str,str]] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Uploads a single file to ComfyUI API and returns its download URL.
|
||||
@@ -323,7 +324,7 @@ def upload_file_to_comfyapi(
|
||||
file_bytes_io: BytesIO object containing the file data.
|
||||
filename: The filename of the file.
|
||||
upload_mime_type: MIME type of the file.
|
||||
auth_token: Optional authentication token.
|
||||
auth_kwargs: Optional authentication token(s).
|
||||
|
||||
Returns:
|
||||
The download URL for the uploaded file.
|
||||
@@ -337,7 +338,7 @@ def upload_file_to_comfyapi(
|
||||
response_model=UploadResponse,
|
||||
),
|
||||
request=request_object,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
)
|
||||
|
||||
response: UploadResponse = operation.execute()
|
||||
@@ -351,7 +352,7 @@ def upload_file_to_comfyapi(
|
||||
|
||||
def upload_video_to_comfyapi(
|
||||
video: VideoInput,
|
||||
auth_token: Optional[str] = None,
|
||||
auth_kwargs: Optional[dict[str,str]] = None,
|
||||
container: VideoContainer = VideoContainer.MP4,
|
||||
codec: VideoCodec = VideoCodec.H264,
|
||||
max_duration: Optional[int] = None,
|
||||
@@ -362,7 +363,7 @@ def upload_video_to_comfyapi(
|
||||
|
||||
Args:
|
||||
video: VideoInput object (Comfy VIDEO type).
|
||||
auth_token: Optional authentication token.
|
||||
auth_kwargs: Optional authentication token(s).
|
||||
container: The video container format to use (default: MP4).
|
||||
codec: The video codec to use (default: H264).
|
||||
max_duration: Optional maximum duration of the video in seconds. If the video is longer than this, an error will be raised.
|
||||
@@ -390,7 +391,7 @@ def upload_video_to_comfyapi(
|
||||
video_bytes_io.seek(0)
|
||||
|
||||
return upload_file_to_comfyapi(
|
||||
video_bytes_io, filename, upload_mime_type, auth_token
|
||||
video_bytes_io, filename, upload_mime_type, auth_kwargs
|
||||
)
|
||||
|
||||
|
||||
@@ -453,7 +454,7 @@ def audio_ndarray_to_bytesio(
|
||||
|
||||
def upload_audio_to_comfyapi(
|
||||
audio: AudioInput,
|
||||
auth_token: Optional[str] = None,
|
||||
auth_kwargs: Optional[dict[str,str]] = None,
|
||||
container_format: str = "mp4",
|
||||
codec_name: str = "aac",
|
||||
mime_type: str = "audio/mp4",
|
||||
@@ -465,7 +466,7 @@ def upload_audio_to_comfyapi(
|
||||
|
||||
Args:
|
||||
audio: a Comfy `AUDIO` type (contains waveform tensor and sample_rate)
|
||||
auth_token: Optional authentication token.
|
||||
auth_kwargs: Optional authentication token(s).
|
||||
|
||||
Returns:
|
||||
The download URL for the uploaded audio file.
|
||||
@@ -477,11 +478,11 @@ def upload_audio_to_comfyapi(
|
||||
audio_data_np, sample_rate, container_format, codec_name
|
||||
)
|
||||
|
||||
return upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_token)
|
||||
return upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs)
|
||||
|
||||
|
||||
def upload_images_to_comfyapi(
|
||||
image: torch.Tensor, max_images=8, auth_token=None, mime_type: Optional[str] = None
|
||||
image: torch.Tensor, max_images=8, auth_kwargs: Optional[dict[str,str]] = None, mime_type: Optional[str] = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Uploads images to ComfyUI API and returns download URLs.
|
||||
@@ -490,7 +491,7 @@ def upload_images_to_comfyapi(
|
||||
Args:
|
||||
image: Input torch.Tensor image.
|
||||
max_images: Maximum number of images to upload.
|
||||
auth_token: Optional authentication token.
|
||||
auth_kwargs: Optional authentication token(s).
|
||||
mime_type: Optional MIME type for the image.
|
||||
"""
|
||||
# if batch, try to upload each file if max_images is greater than 0
|
||||
@@ -521,7 +522,7 @@ def upload_images_to_comfyapi(
|
||||
response_model=UploadResponse,
|
||||
),
|
||||
request=request_object,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
)
|
||||
response = operation.execute()
|
||||
|
||||
|
||||
@@ -20,7 +20,8 @@ Usage Examples:
|
||||
# 1. Create the API client
|
||||
api_client = ApiClient(
|
||||
base_url="https://api.example.com",
|
||||
api_key="your_api_key_here",
|
||||
auth_token="your_auth_token_here",
|
||||
comfy_api_key="your_comfy_api_key_here",
|
||||
timeout=30.0,
|
||||
verify_ssl=True
|
||||
)
|
||||
@@ -146,12 +147,14 @@ class ApiClient:
|
||||
def __init__(
|
||||
self,
|
||||
base_url: str,
|
||||
api_key: Optional[str] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
comfy_api_key: Optional[str] = None,
|
||||
timeout: float = 3600.0,
|
||||
verify_ssl: bool = True,
|
||||
):
|
||||
self.base_url = base_url
|
||||
self.api_key = api_key
|
||||
self.auth_token = auth_token
|
||||
self.comfy_api_key = comfy_api_key
|
||||
self.timeout = timeout
|
||||
self.verify_ssl = verify_ssl
|
||||
|
||||
@@ -201,8 +204,10 @@ class ApiClient:
        """Get headers for API requests, including authentication if available"""
        headers = {"Content-Type": "application/json", "Accept": "application/json"}

        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        if self.auth_token:
            headers["Authorization"] = f"Bearer {self.auth_token}"
        elif self.comfy_api_key:
            headers["X-API-KEY"] = self.comfy_api_key

        return headers
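After this change the header selection is either a bearer token or the Comfy API key, never both. A hypothetical illustration (token values and base URL are placeholders):

```python
client = ApiClient(base_url="https://api.example.com", auth_token="tok_123")
client.get_headers()
# {"Content-Type": "application/json", "Accept": "application/json", "Authorization": "Bearer tok_123"}

client = ApiClient(base_url="https://api.example.com", comfy_api_key="key_456")
client.get_headers()
# {..., "X-API-KEY": "key_456"}
```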
@@ -236,7 +241,7 @@ class ApiClient:
            requests.RequestException: If the request fails
        """
        url = urljoin(self.base_url, path)
        self.check_auth_token(self.api_key)
        self.check_auth(self.auth_token, self.comfy_api_key)
        # Combine default headers with any provided headers
        request_headers = self.get_headers()
        if headers:
@@ -320,11 +325,11 @@ class ApiClient:
            return response.json()
        return {}

    def check_auth_token(self, auth_token):
        """Verify that an auth token is present."""
        if auth_token is None:
    def check_auth(self, auth_token, comfy_api_key):
        """Verify that an auth token is present or comfy_api_key is present"""
        if auth_token is None and comfy_api_key is None:
            raise Exception("Unauthorized: Please login first to use this node.")
        return auth_token
        return auth_token or comfy_api_key

    @staticmethod
    def upload_file(
@@ -392,6 +397,8 @@ class SynchronousOperation(Generic[T, R]):
|
||||
files: Optional[Dict[str, Any]] = None,
|
||||
api_base: str | None = None,
|
||||
auth_token: Optional[str] = None,
|
||||
comfy_api_key: Optional[str] = None,
|
||||
auth_kwargs: Optional[Dict[str,str]] = None,
|
||||
timeout: float = 604800.0,
|
||||
verify_ssl: bool = True,
|
||||
content_type: str = "application/json",
|
||||
@@ -403,6 +410,10 @@ class SynchronousOperation(Generic[T, R]):
|
||||
self.error = None
|
||||
self.api_base: str = api_base or args.comfy_api_base
|
||||
self.auth_token = auth_token
|
||||
self.comfy_api_key = comfy_api_key
|
||||
if auth_kwargs is not None:
|
||||
self.auth_token = auth_kwargs.get("auth_token", self.auth_token)
|
||||
self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key)
|
||||
self.timeout = timeout
|
||||
self.verify_ssl = verify_ssl
|
||||
self.files = files
|
||||
@@ -415,7 +426,8 @@ class SynchronousOperation(Generic[T, R]):
|
||||
if client is None:
|
||||
client = ApiClient(
|
||||
base_url=self.api_base,
|
||||
api_key=self.auth_token,
|
||||
auth_token=self.auth_token,
|
||||
comfy_api_key=self.comfy_api_key,
|
||||
timeout=self.timeout,
|
||||
verify_ssl=self.verify_ssl,
|
||||
)
|
||||
@@ -502,12 +514,18 @@ class PollingOperation(Generic[T, R]):
|
||||
request: Optional[T] = None,
|
||||
api_base: str | None = None,
|
||||
auth_token: Optional[str] = None,
|
||||
comfy_api_key: Optional[str] = None,
|
||||
auth_kwargs: Optional[Dict[str,str]] = None,
|
||||
poll_interval: float = 5.0,
|
||||
):
|
||||
self.poll_endpoint = poll_endpoint
|
||||
self.request = request
|
||||
self.api_base: str = api_base or args.comfy_api_base
|
||||
self.auth_token = auth_token
|
||||
self.comfy_api_key = comfy_api_key
|
||||
if auth_kwargs is not None:
|
||||
self.auth_token = auth_kwargs.get("auth_token", self.auth_token)
|
||||
self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key)
|
||||
self.poll_interval = poll_interval
|
||||
|
||||
# Polling configuration
|
||||
@@ -528,7 +546,8 @@ class PollingOperation(Generic[T, R]):
|
||||
if client is None:
|
||||
client = ApiClient(
|
||||
base_url=self.api_base,
|
||||
api_key=self.auth_token,
|
||||
auth_token=self.auth_token,
|
||||
comfy_api_key=self.comfy_api_key,
|
||||
)
|
||||
return self._poll_until_complete(client)
|
||||
except Exception as e:
|
||||
|
||||
@@ -81,7 +81,6 @@ class RecraftStyle:
|
||||
|
||||
class RecraftIO:
|
||||
STYLEV3 = "RECRAFT_V3_STYLE"
|
||||
SVG = "SVG" # TODO: if acceptable, move into ComfyUI's typing class
|
||||
COLOR = "RECRAFT_COLOR"
|
||||
CONTROLS = "RECRAFT_CONTROLS"
|
||||
|
||||
|
||||
10  comfy_api_nodes/canary.py  Normal file
@@ -0,0 +1,10 @@
import av

ver = av.__version__.split(".")
if int(ver[0]) < 14:
    raise Exception("INSTALL NEW VERSION OF PYAV TO USE API NODES.")

if int(ver[0]) == 14 and int(ver[1]) < 2:
    raise Exception("INSTALL NEW VERSION OF PYAV TO USE API NODES.")

NODE_CLASS_MAPPINGS = {}
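The canary above is a simple major/minor gate requiring PyAV 14.2 or newer before the API nodes load. If the string splitting ever becomes a problem (for example with pre-release suffixes), a similar check could be written with `packaging`; this is a hypothetical alternative, not what the commit ships, and it assumes that extra dependency is acceptable:

```python
from packaging import version
import av

if version.parse(av.__version__) < version.parse("14.2"):
    raise Exception("INSTALL NEW VERSION OF PYAV TO USE API NODES.")
```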
@@ -179,6 +179,7 @@ class FluxProUltraImageNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -211,7 +212,6 @@ class FluxProUltraImageNode(ComfyNodeABC):
|
||||
seed=0,
|
||||
image_prompt=None,
|
||||
image_prompt_strength=0.1,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
if image_prompt is None:
|
||||
@@ -244,7 +244,7 @@ class FluxProUltraImageNode(ComfyNodeABC):
|
||||
None if image_prompt is None else round(image_prompt_strength, 2)
|
||||
),
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
@@ -319,6 +319,7 @@ class FluxProImageNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -337,7 +338,6 @@ class FluxProImageNode(ComfyNodeABC):
|
||||
seed=0,
|
||||
image_prompt=None,
|
||||
# image_prompt_strength=0.1,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
image_prompt = (
|
||||
@@ -361,7 +361,7 @@ class FluxProImageNode(ComfyNodeABC):
|
||||
seed=seed,
|
||||
image_prompt=image_prompt,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
@@ -461,6 +461,7 @@ class FluxProExpandNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -482,7 +483,6 @@ class FluxProExpandNode(ComfyNodeABC):
|
||||
steps: int,
|
||||
guidance: float,
|
||||
seed=0,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
image = convert_image_to_base64(image)
|
||||
@@ -506,7 +506,7 @@ class FluxProExpandNode(ComfyNodeABC):
|
||||
seed=seed,
|
||||
image=image,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
@@ -572,6 +572,7 @@ class FluxProFillNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -590,7 +591,6 @@ class FluxProFillNode(ComfyNodeABC):
|
||||
steps: int,
|
||||
guidance: float,
|
||||
seed=0,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
# prepare mask
|
||||
@@ -615,7 +615,7 @@ class FluxProFillNode(ComfyNodeABC):
|
||||
image=image,
|
||||
mask=mask,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
@@ -706,6 +706,7 @@ class FluxProCannyNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -726,7 +727,6 @@ class FluxProCannyNode(ComfyNodeABC):
|
||||
steps: int,
|
||||
guidance: float,
|
||||
seed=0,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
control_image = convert_image_to_base64(control_image[:,:,:,:3])
|
||||
@@ -763,7 +763,7 @@ class FluxProCannyNode(ComfyNodeABC):
|
||||
canny_high_threshold=canny_high_threshold,
|
||||
preprocessed_image=preprocessed_image,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
@@ -834,6 +834,7 @@ class FluxProDepthNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -852,7 +853,6 @@ class FluxProDepthNode(ComfyNodeABC):
|
||||
steps: int,
|
||||
guidance: float,
|
||||
seed=0,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
control_image = convert_image_to_base64(control_image[:,:,:,:3])
|
||||
@@ -878,7 +878,7 @@ class FluxProDepthNode(ComfyNodeABC):
|
||||
control_image=control_image,
|
||||
preprocessed_image=preprocessed_image,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
output_image = handle_bfl_synchronous_operation(operation)
|
||||
return (output_image,)
|
||||
|
||||
@@ -234,9 +234,7 @@ def download_and_process_images(image_urls):
|
||||
|
||||
class IdeogramV1(ComfyNodeABC):
|
||||
"""
|
||||
Generates images synchronously using the Ideogram V1 model.
|
||||
|
||||
Images links are available for a limited period of time; if you would like to keep the image, you must download it.
|
||||
Generates images using the Ideogram V1 model.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -303,7 +301,10 @@ class IdeogramV1(ComfyNodeABC):
|
||||
{"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -321,7 +322,7 @@ class IdeogramV1(ComfyNodeABC):
|
||||
seed=0,
|
||||
negative_prompt="",
|
||||
num_images=1,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Determine the model based on turbo setting
|
||||
aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None)
|
||||
@@ -347,7 +348,7 @@ class IdeogramV1(ComfyNodeABC):
|
||||
negative_prompt=negative_prompt if negative_prompt else None,
|
||||
)
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
response = operation.execute()
|
||||
@@ -365,9 +366,7 @@ class IdeogramV1(ComfyNodeABC):
|
||||
|
||||
class IdeogramV2(ComfyNodeABC):
|
||||
"""
|
||||
Generates images synchronously using the Ideogram V2 model.
|
||||
|
||||
Images links are available for a limited period of time; if you would like to keep the image, you must download it.
|
||||
Generates images using the Ideogram V2 model.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -458,7 +457,10 @@ class IdeogramV2(ComfyNodeABC):
|
||||
# },
|
||||
#),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -479,7 +481,7 @@ class IdeogramV2(ComfyNodeABC):
|
||||
negative_prompt="",
|
||||
num_images=1,
|
||||
color_palette="",
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None)
|
||||
resolution = V1_V1_RES_MAP.get(resolution, None)
|
||||
@@ -519,7 +521,7 @@ class IdeogramV2(ComfyNodeABC):
|
||||
color_palette=color_palette if color_palette else None,
|
||||
)
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
response = operation.execute()
|
||||
@@ -536,10 +538,7 @@ class IdeogramV2(ComfyNodeABC):
|
||||
|
||||
class IdeogramV3(ComfyNodeABC):
|
||||
"""
|
||||
Generates images synchronously using the Ideogram V3 model.
|
||||
|
||||
Supports both regular image generation from text prompts and image editing with mask.
|
||||
Images links are available for a limited period of time; if you would like to keep the image, you must download it.
|
||||
Generates images using the Ideogram V3 model. Supports both regular image generation from text prompts and image editing with mask.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -621,7 +620,10 @@ class IdeogramV3(ComfyNodeABC):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -641,7 +643,7 @@ class IdeogramV3(ComfyNodeABC):
|
||||
seed=0,
|
||||
num_images=1,
|
||||
rendering_speed="BALANCED",
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Check if both image and mask are provided for editing mode
|
||||
if image is not None and mask is not None:
|
||||
@@ -705,7 +707,7 @@ class IdeogramV3(ComfyNodeABC):
|
||||
"mask": mask_binary,
|
||||
},
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
elif image is not None or mask is not None:
|
||||
@@ -746,7 +748,7 @@ class IdeogramV3(ComfyNodeABC):
|
||||
response_model=IdeogramGenerateResponse,
|
||||
),
|
||||
request=gen_request,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
# Execute the operation and process response
|
||||
|
||||
@@ -95,7 +95,7 @@ class KlingApiError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def poll_until_finished(auth_token: str, api_endpoint: ApiEndpoint[Any, R]) -> R:
|
||||
def poll_until_finished(auth_kwargs: dict[str,str], api_endpoint: ApiEndpoint[Any, R]) -> R:
|
||||
"""Polls the Kling API endpoint until the task reaches a terminal state, then returns the response."""
|
||||
return PollingOperation(
|
||||
poll_endpoint=api_endpoint,
|
||||
@@ -108,7 +108,7 @@ def poll_until_finished(auth_token: str, api_endpoint: ApiEndpoint[Any, R]) -> R
|
||||
if response.data and response.data.task_status
|
||||
else None
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
).execute()
|
||||
|
||||
|
||||
@@ -184,6 +184,33 @@ def validate_image_result_response(response) -> None:
        raise KlingApiError(error_msg)


def validate_input_image(image: torch.Tensor) -> None:
    """
    Validates the input image adheres to the expectations of the Kling API:
    - The image resolution should not be less than 300*300px
    - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1

    See: https://app.klingai.com/global/dev/document-api/apiReference/model/imageToVideo
    """
    if len(image.shape) == 4:
        height, width = image.shape[1], image.shape[2]
    elif len(image.shape) == 3:
        height, width = image.shape[0], image.shape[1]
    else:
        raise ValueError("Invalid image tensor shape.")

    # Ensure minimum resolution is met
    if height < 300:
        raise ValueError("Image height must be at least 300px")
    if width < 300:
        raise ValueError("Image width must be at least 300px")

    # Ensure aspect ratio is within acceptable range
    aspect_ratio = width / height
    if aspect_ratio < 1 / 2.5 or aspect_ratio > 2.5:
        raise ValueError("Image aspect ratio must be between 1:2.5 and 2.5:1")
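The docstring covers the two constraints; a quick hypothetical check with dummy tensors shows how they trip (shapes follow ComfyUI's [batch, height, width, channels] IMAGE convention):

```python
import torch

validate_input_image(torch.zeros(1, 512, 512, 3))   # ok: 512x512, aspect ratio 1.0
validate_input_image(torch.zeros(1, 256, 512, 3))   # raises: height below 300px
validate_input_image(torch.zeros(1, 300, 900, 3))   # raises: aspect ratio 3.0 > 2.5
```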
def get_camera_control_input_config(
|
||||
tooltip: str, default: float = 0.0
|
||||
) -> tuple[IO, InputTypeOptions]:
|
||||
@@ -391,16 +418,19 @@ class KlingTextToVideoNode(KlingNodeBase):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("VIDEO", "STRING", "STRING")
|
||||
RETURN_NAMES = ("VIDEO", "video_id", "duration")
|
||||
DESCRIPTION = "Kling Text to Video Node"
|
||||
|
||||
def get_response(self, task_id: str, auth_token: str) -> KlingText2VideoResponse:
|
||||
def get_response(self, task_id: str, auth_kwargs: dict[str,str]) -> KlingText2VideoResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_TEXT_TO_VIDEO}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -419,7 +449,7 @@ class KlingTextToVideoNode(KlingNodeBase):
|
||||
camera_control: Optional[KlingCameraControl] = None,
|
||||
model_name: Optional[str] = None,
|
||||
duration: Optional[str] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile, str, str]:
|
||||
validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V)
|
||||
if model_name is None:
|
||||
@@ -441,14 +471,14 @@ class KlingTextToVideoNode(KlingNodeBase):
|
||||
aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio),
|
||||
camera_control=camera_control,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
|
||||
task_id = task_creation_response.data.task_id
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_video_result_response(final_response)
|
||||
|
||||
video = get_video_from_response(final_response)
|
||||
@@ -495,7 +525,10 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text."
|
||||
@@ -507,7 +540,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode):
|
||||
cfg_scale: float,
|
||||
aspect_ratio: str,
|
||||
camera_control: Optional[KlingCameraControl] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
return super().api_call(
|
||||
model_name=KlingVideoGenModelName.kling_v1,
|
||||
@@ -518,7 +551,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode):
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
camera_control=camera_control,
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -530,7 +563,10 @@ class KlingImage2VideoNode(KlingNodeBase):
|
||||
return {
|
||||
"required": {
|
||||
"start_frame": model_field_to_node_input(
|
||||
IO.IMAGE, KlingImage2VideoRequest, "image"
|
||||
IO.IMAGE,
|
||||
KlingImage2VideoRequest,
|
||||
"image",
|
||||
tooltip="The reference image used to generate the video.",
|
||||
),
|
||||
"prompt": model_field_to_node_input(
|
||||
IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True
|
||||
@@ -574,16 +610,19 @@ class KlingImage2VideoNode(KlingNodeBase):
|
||||
enum_type=KlingVideoGenDuration,
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("VIDEO", "STRING", "STRING")
|
||||
RETURN_NAMES = ("VIDEO", "video_id", "duration")
|
||||
DESCRIPTION = "Kling Image to Video Node"
|
||||
|
||||
def get_response(self, task_id: str, auth_token: str) -> KlingImage2VideoResponse:
|
||||
def get_response(self, task_id: str, auth_kwargs: dict[str,str]) -> KlingImage2VideoResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -604,12 +643,13 @@ class KlingImage2VideoNode(KlingNodeBase):
|
||||
duration: str,
|
||||
camera_control: Optional[KlingCameraControl] = None,
|
||||
end_frame: Optional[torch.Tensor] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V)
|
||||
validate_input_image(start_frame)
|
||||
|
||||
if camera_control is not None:
|
||||
# Camera control type for image 2 video is always simple
|
||||
# Camera control type for image 2 video is always `simple`
|
||||
camera_control.type = KlingCameraControlType.simple
|
||||
|
||||
initial_operation = SynchronousOperation(
|
||||
@@ -631,18 +671,17 @@ class KlingImage2VideoNode(KlingNodeBase):
|
||||
negative_prompt=negative_prompt if negative_prompt else None,
|
||||
cfg_scale=cfg_scale,
|
||||
mode=KlingVideoGenMode(mode),
|
||||
aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio),
|
||||
duration=KlingVideoGenDuration(duration),
|
||||
camera_control=camera_control,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_video_result_response(final_response)
|
||||
|
||||
video = get_video_from_response(final_response)
|
||||
@@ -692,7 +731,10 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image."
|
||||
@@ -705,7 +747,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode):
|
||||
cfg_scale: float,
|
||||
aspect_ratio: str,
|
||||
camera_control: KlingCameraControl,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
return super().api_call(
|
||||
model_name=KlingVideoGenModelName.kling_v1_5,
|
||||
@@ -717,7 +759,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode):
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
camera_control=camera_control,
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -785,7 +827,10 @@ class KlingStartEndFrameNode(KlingImage2VideoNode):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last."
|
||||
@@ -799,7 +844,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode):
|
||||
cfg_scale: float,
|
||||
aspect_ratio: str,
|
||||
mode: str,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
mode, duration, model_name = KlingStartEndFrameNode.get_mode_string_mapping()[
|
||||
mode
|
||||
@@ -814,7 +859,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode):
|
||||
aspect_ratio=aspect_ratio,
|
||||
duration=duration,
|
||||
end_frame=end_frame,
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -844,16 +889,19 @@ class KlingVideoExtendNode(KlingNodeBase):
|
||||
IO.STRING, KlingVideoExtendRequest, "video_id", forceInput=True
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("VIDEO", "STRING", "STRING")
|
||||
RETURN_NAMES = ("VIDEO", "video_id", "duration")
|
||||
DESCRIPTION = "Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes."
|
||||
|
||||
def get_response(self, task_id: str, auth_token: str) -> KlingVideoExtendResponse:
|
||||
def get_response(self, task_id: str, auth_kwargs: dict[str,str]) -> KlingVideoExtendResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_VIDEO_EXTEND}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -868,7 +916,7 @@ class KlingVideoExtendNode(KlingNodeBase):
|
||||
negative_prompt: str,
|
||||
cfg_scale: float,
|
||||
video_id: str,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile, str, str]:
|
||||
validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V)
|
||||
initial_operation = SynchronousOperation(
|
||||
@@ -884,14 +932,14 @@ class KlingVideoExtendNode(KlingNodeBase):
|
||||
cfg_scale=cfg_scale,
|
||||
video_id=video_id,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_video_result_response(final_response)
|
||||
|
||||
video = get_video_from_response(final_response)
|
||||
@@ -904,9 +952,9 @@ class KlingVideoEffectsBase(KlingNodeBase):
|
||||
RETURN_TYPES = ("VIDEO", "STRING", "STRING")
|
||||
RETURN_NAMES = ("VIDEO", "video_id", "duration")
|
||||
|
||||
def get_response(self, task_id: str, auth_token: str) -> KlingVideoEffectsResponse:
|
||||
def get_response(self, task_id: str, auth_kwargs: dict[str,str]) -> KlingVideoEffectsResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_VIDEO_EFFECTS}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -924,7 +972,7 @@ class KlingVideoEffectsBase(KlingNodeBase):
|
||||
image_1: torch.Tensor,
|
||||
image_2: Optional[torch.Tensor] = None,
|
||||
mode: Optional[KlingVideoGenMode] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
if dual_character:
|
||||
request_input_field = KlingDualCharacterEffectInput(
|
||||
@@ -954,14 +1002,14 @@ class KlingVideoEffectsBase(KlingNodeBase):
|
||||
effect_scene=effect_scene,
|
||||
input=request_input_field,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_video_result_response(final_response)
|
||||
|
||||
video = get_video_from_response(final_response)
|
||||
@@ -1002,7 +1050,10 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase):
|
||||
enum_type=KlingVideoGenDuration,
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite."
|
||||
@@ -1017,7 +1068,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase):
|
||||
model_name: KlingCharacterEffectModelName,
|
||||
mode: KlingVideoGenMode,
|
||||
duration: KlingVideoGenDuration,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
video, _, duration = super().api_call(
|
||||
dual_character=True,
|
||||
@@ -1027,7 +1078,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase):
|
||||
duration=duration,
|
||||
image_1=image_left,
|
||||
image_2=image_right,
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
return video, duration
|
||||
|
||||
@@ -1063,7 +1114,10 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase):
|
||||
enum_type=KlingVideoGenDuration,
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene."
|
||||
@@ -1074,7 +1128,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase):
|
||||
effect_scene: KlingSingleImageEffectsScene,
|
||||
model_name: KlingSingleImageEffectModelName,
|
||||
duration: KlingVideoGenDuration,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
return super().api_call(
|
||||
dual_character=False,
|
||||
@@ -1082,7 +1136,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase):
|
||||
model_name=model_name,
|
||||
duration=duration,
|
||||
image_1=image,
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -1100,10 +1154,10 @@ class KlingLipSyncBase(KlingNodeBase):
|
||||
f"Text is too long. Maximum length is {MAX_PROMPT_LENGTH_LIP_SYNC} characters."
|
||||
)
|
||||
|
||||
def get_response(self, task_id: str, auth_token: str) -> KlingLipSyncResponse:
|
||||
def get_response(self, task_id: str, auth_kwargs: dict[str,str]) -> KlingLipSyncResponse:
|
||||
"""Polls the Kling API endpoint until the task reaches a terminal state."""
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_LIP_SYNC}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -1121,18 +1175,18 @@ class KlingLipSyncBase(KlingNodeBase):
|
||||
text: Optional[str] = None,
|
||||
voice_speed: Optional[float] = None,
|
||||
voice_id: Optional[str] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> tuple[VideoFromFile, str, str]:
|
||||
if text:
|
||||
self.validate_text(text)
|
||||
|
||||
# Upload video to Comfy API and get download URL
|
||||
video_url = upload_video_to_comfyapi(video, auth_token)
|
||||
video_url = upload_video_to_comfyapi(video, auth_kwargs=kwargs)
|
||||
logging.info("Uploaded video to Comfy API. URL: %s", video_url)
|
||||
|
||||
# Upload the audio file to Comfy API and get download URL
|
||||
if audio:
|
||||
audio_url = upload_audio_to_comfyapi(audio, auth_token)
|
||||
audio_url = upload_audio_to_comfyapi(audio, auth_kwargs=kwargs)
|
||||
logging.info("Uploaded audio to Comfy API. URL: %s", audio_url)
|
||||
else:
|
||||
audio_url = None
|
||||
@@ -1156,14 +1210,14 @@ class KlingLipSyncBase(KlingNodeBase):
|
||||
voice_id=voice_id,
|
||||
),
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_video_result_response(final_response)
|
||||
|
||||
video = get_video_from_response(final_response)
|
||||
@@ -1186,7 +1240,10 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase):
|
||||
enum_type=KlingLipSyncVoiceLanguage,
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file."
|
||||
@@ -1196,14 +1253,14 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase):
|
||||
video: VideoInput,
|
||||
audio: AudioInput,
|
||||
voice_language: str,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
return super().api_call(
|
||||
video=video,
|
||||
audio=audio,
|
||||
voice_language=voice_language,
|
||||
mode="audio2video",
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -1292,7 +1349,10 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase):
|
||||
IO.FLOAT, KlingLipSyncInputObject, "voice_speed", slider=True
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt."
|
||||
@@ -1303,7 +1363,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase):
|
||||
text: str,
|
||||
voice: str,
|
||||
voice_speed: float,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
voice_id, voice_language = KlingLipSyncTextToVideoNode.get_voice_config()[voice]
|
||||
return super().api_call(
|
||||
@@ -1313,7 +1373,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase):
|
||||
voice_id=voice_id,
|
||||
voice_speed=voice_speed,
|
||||
mode="text2video",
|
||||
auth_token=auth_token,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -1350,16 +1410,19 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase):
|
||||
enum_type=KlingVirtualTryOnModelName,
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human."
|
||||
|
||||
def get_response(
|
||||
self, task_id: str, auth_token: Optional[str] = None
|
||||
self, task_id: str, auth_kwargs: dict[str,str] = None
|
||||
) -> KlingVirtualTryOnResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -1373,7 +1436,7 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase):
|
||||
human_image: torch.Tensor,
|
||||
cloth_image: torch.Tensor,
|
||||
model_name: KlingVirtualTryOnModelName,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
initial_operation = SynchronousOperation(
|
||||
endpoint=ApiEndpoint(
|
||||
@@ -1387,14 +1450,14 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase):
|
||||
cloth_image=tensor_to_base64_string(cloth_image),
|
||||
model_name=model_name,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_image_result_response(final_response)
|
||||
|
||||
images = get_images_from_response(final_response)
|
||||
@@ -1462,16 +1525,19 @@ class KlingImageGenerationNode(KlingImageGenerationBase):
|
||||
"optional": {
|
||||
"image": (IO.IMAGE, {}),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
DESCRIPTION = "Kling Image Generation Node. Generate an image from a text prompt with an optional reference image."
|
||||
|
||||
def get_response(
|
||||
self, task_id: str, auth_token: Optional[str] = None
|
||||
self, task_id: str, auth_kwargs: Optional[dict[str,str]] = None
|
||||
) -> KlingImageGenerationsResponse:
|
||||
return poll_until_finished(
|
||||
auth_token,
|
||||
auth_kwargs,
|
||||
ApiEndpoint(
|
||||
path=f"{PATH_IMAGE_GENERATIONS}/{task_id}",
|
||||
method=HttpMethod.GET,
|
||||
@@ -1491,7 +1557,7 @@ class KlingImageGenerationNode(KlingImageGenerationBase):
|
||||
n: int,
|
||||
aspect_ratio: KlingImageGenAspectRatio,
|
||||
image: Optional[torch.Tensor] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.validate_prompt(prompt, negative_prompt)
|
||||
|
||||
@@ -1516,14 +1582,14 @@ class KlingImageGenerationNode(KlingImageGenerationBase):
|
||||
n=n,
|
||||
aspect_ratio=aspect_ratio,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
task_creation_response = initial_operation.execute()
|
||||
validate_task_creation_response(task_creation_response)
|
||||
task_id = task_creation_response.data.task_id
|
||||
|
||||
final_response = self.get_response(task_id, auth_token)
|
||||
final_response = self.get_response(task_id, auth_kwargs=kwargs)
|
||||
validate_image_result_response(final_response)
|
||||
|
||||
images = get_images_from_response(final_response)
|
||||
|
||||
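Every Kling hunk above follows the same migration: the explicit auth_token parameter is dropped, the node declares two hidden inputs (auth_token and comfy_api_key), and whatever the executor fills in arrives through **kwargs and is forwarded wholesale as auth_kwargs. A minimal sketch of that node-side pattern, with a hypothetical call_api helper standing in for SynchronousOperation/poll_until_finished:

```python
from typing import Optional

def call_api(path: str, payload: dict, auth_kwargs: Optional[dict[str, str]] = None) -> dict:
    # Hypothetical stand-in for SynchronousOperation/poll_until_finished: it would
    # read auth_kwargs["auth_token"] or auth_kwargs["comfy_api_key"] and attach
    # whichever credential is present to the outgoing request.
    auth_kwargs = auth_kwargs or {}
    assert "auth_token" in auth_kwargs or "comfy_api_key" in auth_kwargs
    return {"path": path, "payload": payload}

class ExampleApiNode:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {"prompt": ("STRING", {})},
            # Hidden inputs are filled in by the executor, never by the user.
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
                "comfy_api_key": "API_KEY_COMFY_ORG",
            },
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "api_call"

    def api_call(self, prompt: str, **kwargs):
        # kwargs now carries auth_token and/or comfy_api_key; pass the whole dict
        # through so downstream helpers can pick whichever credential is set.
        response = call_api("/proxy/example", {"prompt": prompt}, auth_kwargs=kwargs)
        return (str(response),)
```

Passing the whole kwargs dict through means a node works whether the caller supplied a login token, an API key, or both.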
@@ -1,4 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from inspect import cleandoc
|
||||
from typing import Optional
|
||||
from comfy.comfy_types.node_typing import IO, ComfyNodeABC
|
||||
from comfy_api.input_impl.video_types import VideoFromFile
|
||||
from comfy_api_nodes.apis.luma_api import (
|
||||
@@ -201,6 +203,7 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -214,7 +217,6 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
image_luma_ref: LumaReferenceChain = None,
|
||||
style_image: torch.Tensor = None,
|
||||
character_image: torch.Tensor = None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=True, min_length=3)
|
||||
@@ -222,19 +224,19 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
api_image_ref = None
|
||||
if image_luma_ref is not None:
|
||||
api_image_ref = self._convert_luma_refs(
|
||||
image_luma_ref, max_refs=4, auth_token=auth_token
|
||||
image_luma_ref, max_refs=4, auth_kwargs=kwargs,
|
||||
)
|
||||
# handle style_luma_ref
|
||||
api_style_ref = None
|
||||
if style_image is not None:
|
||||
api_style_ref = self._convert_style_image(
|
||||
style_image, weight=style_image_weight, auth_token=auth_token
|
||||
style_image, weight=style_image_weight, auth_kwargs=kwargs,
|
||||
)
|
||||
# handle character_ref images
|
||||
character_ref = None
|
||||
if character_image is not None:
|
||||
download_urls = upload_images_to_comfyapi(
|
||||
character_image, max_images=4, auth_token=auth_token
|
||||
character_image, max_images=4, auth_kwargs=kwargs,
|
||||
)
|
||||
character_ref = LumaCharacterRef(
|
||||
identity0=LumaImageIdentity(images=download_urls)
|
||||
@@ -255,7 +257,7 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
style_ref=api_style_ref,
|
||||
character_ref=character_ref,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api: LumaGeneration = operation.execute()
|
||||
|
||||
@@ -269,7 +271,7 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
completed_statuses=[LumaState.completed],
|
||||
failed_statuses=[LumaState.failed],
|
||||
status_extractor=lambda x: x.state,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -278,13 +280,13 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
return (img,)
|
||||
|
||||
def _convert_luma_refs(
|
||||
self, luma_ref: LumaReferenceChain, max_refs: int, auth_token=None
|
||||
self, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None
|
||||
):
|
||||
luma_urls = []
|
||||
ref_count = 0
|
||||
for ref in luma_ref.refs:
|
||||
download_urls = upload_images_to_comfyapi(
|
||||
ref.image, max_images=1, auth_token=auth_token
|
||||
ref.image, max_images=1, auth_kwargs=auth_kwargs
|
||||
)
|
||||
luma_urls.append(download_urls[0])
|
||||
ref_count += 1
|
||||
@@ -293,12 +295,12 @@ class LumaImageGenerationNode(ComfyNodeABC):
|
||||
return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs)
|
||||
|
||||
def _convert_style_image(
|
||||
self, style_image: torch.Tensor, weight: float, auth_token=None
|
||||
self, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None
|
||||
):
|
||||
chain = LumaReferenceChain(
|
||||
first_ref=LumaReference(image=style_image, weight=weight)
|
||||
)
|
||||
return self._convert_luma_refs(chain, max_refs=1, auth_token=auth_token)
|
||||
return self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs)
|
||||
|
||||
|
||||
class LumaImageModifyNode(ComfyNodeABC):
|
||||
@@ -350,6 +352,7 @@ class LumaImageModifyNode(ComfyNodeABC):
|
||||
"optional": {},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -360,12 +363,11 @@ class LumaImageModifyNode(ComfyNodeABC):
|
||||
image: torch.Tensor,
|
||||
image_weight: float,
|
||||
seed,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
# first, upload image
|
||||
download_urls = upload_images_to_comfyapi(
|
||||
image, max_images=1, auth_token=auth_token
|
||||
image, max_images=1, auth_kwargs=kwargs,
|
||||
)
|
||||
image_url = download_urls[0]
|
||||
# next, make Luma call with download url provided
|
||||
@@ -383,7 +385,7 @@ class LumaImageModifyNode(ComfyNodeABC):
|
||||
url=image_url, weight=round(max(min(1.0-image_weight, 0.98), 0.0), 2)
|
||||
),
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api: LumaGeneration = operation.execute()
|
||||
|
||||
@@ -397,7 +399,7 @@ class LumaImageModifyNode(ComfyNodeABC):
|
||||
completed_statuses=[LumaState.completed],
|
||||
failed_statuses=[LumaState.failed],
|
||||
status_extractor=lambda x: x.state,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -470,6 +472,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -483,7 +486,6 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC):
|
||||
loop: bool,
|
||||
seed,
|
||||
luma_concepts: LumaConceptChain = None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False, min_length=3)
|
||||
@@ -506,7 +508,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC):
|
||||
loop=loop,
|
||||
concepts=luma_concepts.create_api_model() if luma_concepts else None,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api: LumaGeneration = operation.execute()
|
||||
|
||||
@@ -520,7 +522,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC):
|
||||
completed_statuses=[LumaState.completed],
|
||||
failed_statuses=[LumaState.failed],
|
||||
status_extractor=lambda x: x.state,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -594,6 +596,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -608,14 +611,13 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
first_image: torch.Tensor = None,
|
||||
last_image: torch.Tensor = None,
|
||||
luma_concepts: LumaConceptChain = None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
if first_image is None and last_image is None:
|
||||
raise Exception(
|
||||
"At least one of first_image and last_image requires an input."
|
||||
)
|
||||
keyframes = self._convert_to_keyframes(first_image, last_image, auth_token)
|
||||
keyframes = self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs)
|
||||
duration = duration if model != LumaVideoModel.ray_1_6 else None
|
||||
resolution = resolution if model != LumaVideoModel.ray_1_6 else None
|
||||
|
||||
@@ -636,7 +638,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
keyframes=keyframes,
|
||||
concepts=luma_concepts.create_api_model() if luma_concepts else None,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api: LumaGeneration = operation.execute()
|
||||
|
||||
@@ -650,7 +652,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
completed_statuses=[LumaState.completed],
|
||||
failed_statuses=[LumaState.failed],
|
||||
status_extractor=lambda x: x.state,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -661,7 +663,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
self,
|
||||
first_image: torch.Tensor = None,
|
||||
last_image: torch.Tensor = None,
|
||||
auth_token=None,
|
||||
auth_kwargs: Optional[dict[str,str]] = None,
|
||||
):
|
||||
if first_image is None and last_image is None:
|
||||
return None
|
||||
@@ -669,12 +671,12 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC):
|
||||
frame1 = None
|
||||
if first_image is not None:
|
||||
download_urls = upload_images_to_comfyapi(
|
||||
first_image, max_images=1, auth_token=auth_token
|
||||
first_image, max_images=1, auth_kwargs=auth_kwargs,
|
||||
)
|
||||
frame0 = LumaImageReference(type="image", url=download_urls[0])
|
||||
if last_image is not None:
|
||||
download_urls = upload_images_to_comfyapi(
|
||||
last_image, max_images=1, auth_token=auth_token
|
||||
last_image, max_images=1, auth_kwargs=auth_kwargs,
|
||||
)
|
||||
frame1 = LumaImageReference(type="image", url=download_urls[0])
|
||||
return LumaKeyframes(frame0=frame0, frame1=frame1)
|
||||
|
||||
@@ -67,6 +67,7 @@ class MinimaxTextToVideoNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -84,7 +85,7 @@ class MinimaxTextToVideoNode:
|
||||
model="T2V-01",
|
||||
image: torch.Tensor=None, # used for ImageToVideo
|
||||
subject: torch.Tensor=None, # used for SubjectToVideo
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
'''
|
||||
Function used between MiniMax nodes - supports T2V, I2V, and S2V, based on provided arguments.
|
||||
@@ -94,12 +95,12 @@ class MinimaxTextToVideoNode:
|
||||
# upload image, if passed in
|
||||
image_url = None
|
||||
if image is not None:
|
||||
image_url = upload_images_to_comfyapi(image, max_images=1, auth_token=auth_token)[0]
|
||||
image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs)[0]
|
||||
|
||||
# TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model
|
||||
subject_reference = None
|
||||
if subject is not None:
|
||||
subject_url = upload_images_to_comfyapi(subject, max_images=1, auth_token=auth_token)[0]
|
||||
subject_url = upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs)[0]
|
||||
subject_reference = [SubjectReferenceItem(image=subject_url)]
|
||||
|
||||
|
||||
@@ -118,7 +119,7 @@ class MinimaxTextToVideoNode:
|
||||
subject_reference=subject_reference,
|
||||
prompt_optimizer=None,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response = video_generate_operation.execute()
|
||||
|
||||
@@ -137,7 +138,7 @@ class MinimaxTextToVideoNode:
|
||||
completed_statuses=["Success"],
|
||||
failed_statuses=["Fail"],
|
||||
status_extractor=lambda x: x.status.value,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
task_result = video_generate_operation.execute()
|
||||
|
||||
@@ -153,7 +154,7 @@ class MinimaxTextToVideoNode:
|
||||
query_params={"file_id": int(file_id)},
|
||||
),
|
||||
request=EmptyRequest(),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
file_result = file_retrieve_operation.execute()
|
||||
|
||||
@@ -221,6 +222,7 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -279,6 +281,7 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -93,7 +93,10 @@ class OpenAIDalle2(ComfyNodeABC):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -110,7 +113,7 @@ class OpenAIDalle2(ComfyNodeABC):
|
||||
mask=None,
|
||||
n=1,
|
||||
size="1024x1024",
|
||||
auth_token=None,
|
||||
**kwargs
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
model = "dall-e-2"
|
||||
@@ -168,7 +171,7 @@ class OpenAIDalle2(ComfyNodeABC):
|
||||
else None
|
||||
),
|
||||
content_type=content_type,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
response = operation.execute()
|
||||
@@ -236,7 +239,10 @@ class OpenAIDalle3(ComfyNodeABC):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -252,7 +258,7 @@ class OpenAIDalle3(ComfyNodeABC):
|
||||
style="natural",
|
||||
quality="standard",
|
||||
size="1024x1024",
|
||||
auth_token=None,
|
||||
**kwargs
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
model = "dall-e-3"
|
||||
@@ -273,7 +279,7 @@ class OpenAIDalle3(ComfyNodeABC):
|
||||
style=style,
|
||||
seed=seed,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
response = operation.execute()
|
||||
@@ -366,7 +372,10 @@ class OpenAIGPTImage1(ComfyNodeABC):
|
||||
},
|
||||
),
|
||||
},
|
||||
"hidden": {"auth_token": "AUTH_TOKEN_COMFY_ORG"},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.IMAGE,)
|
||||
@@ -385,7 +394,7 @@ class OpenAIGPTImage1(ComfyNodeABC):
|
||||
mask=None,
|
||||
n=1,
|
||||
size="1024x1024",
|
||||
auth_token=None,
|
||||
**kwargs
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
model = "gpt-image-1"
|
||||
@@ -462,7 +471,7 @@ class OpenAIGPTImage1(ComfyNodeABC):
|
||||
),
|
||||
files=files if files else None,
|
||||
content_type=content_type,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
response = operation.execute()
|
||||
|
||||
@@ -3,6 +3,7 @@ Pika x ComfyUI API Nodes
|
||||
|
||||
Pika API docs: https://pika-827374fb.mintlify.app/api-reference
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import Optional, TypeVar
|
||||
@@ -120,7 +121,7 @@ class PikaNodeBase(ComfyNodeABC):
|
||||
RETURN_TYPES = ("VIDEO",)
|
||||
|
||||
def poll_for_task_status(
|
||||
self, task_id: str, auth_token: str
|
||||
self, task_id: str, auth_kwargs: Optional[dict[str,str]] = None
|
||||
) -> PikaGenerateResponse:
|
||||
polling_operation = PollingOperation(
|
||||
poll_endpoint=ApiEndpoint(
|
||||
@@ -139,20 +140,20 @@ class PikaNodeBase(ComfyNodeABC):
|
||||
progress_extractor=lambda response: (
|
||||
response.progress if hasattr(response, "progress") else None
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
)
|
||||
return polling_operation.execute()
|
||||
|
||||
def execute_task(
|
||||
self,
|
||||
initial_operation: SynchronousOperation[R, PikaGenerateResponse],
|
||||
auth_token: Optional[str] = None,
|
||||
auth_kwargs: Optional[dict[str,str]] = None,
|
||||
) -> tuple[VideoFromFile]:
|
||||
"""Executes the initial operation then polls for the task status until it is completed.
|
||||
|
||||
Args:
|
||||
initial_operation: The initial operation to execute.
|
||||
auth_token: The authentication token to use for the API call.
|
||||
auth_kwargs: The authentication token(s) to use for the API call.
|
||||
|
||||
Returns:
|
||||
A tuple containing the video file as a VIDEO output.
|
||||
@@ -164,7 +165,7 @@ class PikaNodeBase(ComfyNodeABC):
|
||||
raise PikaApiError(error_msg)
|
||||
|
||||
task_id = initial_response.video_id
|
||||
final_response = self.poll_for_task_status(task_id, auth_token)
|
||||
final_response = self.poll_for_task_status(task_id, auth_kwargs)
|
||||
if not is_valid_video_response(final_response):
|
||||
error_msg = (
|
||||
f"Pika task {task_id} succeeded but no video data found in response."
|
||||
@@ -193,6 +194,7 @@ class PikaImageToVideoV2_2(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -206,7 +208,7 @@ class PikaImageToVideoV2_2(PikaNodeBase):
|
||||
seed: int,
|
||||
resolution: str,
|
||||
duration: int,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> tuple[VideoFromFile]:
|
||||
# Convert image to BytesIO
|
||||
image_bytes_io = tensor_to_bytesio(image)
|
||||
@@ -233,10 +235,10 @@ class PikaImageToVideoV2_2(PikaNodeBase):
|
||||
request=pika_request_data,
|
||||
files=pika_files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikaTextToVideoNodeV2_2(PikaNodeBase):
|
||||
@@ -259,6 +261,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -272,7 +275,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase):
|
||||
resolution: str,
|
||||
duration: int,
|
||||
aspect_ratio: float,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
initial_operation = SynchronousOperation(
|
||||
endpoint=ApiEndpoint(
|
||||
@@ -289,11 +292,11 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase):
|
||||
duration=duration,
|
||||
aspectRatio=aspect_ratio,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
content_type="application/x-www-form-urlencoded",
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikaScenesV2_2(PikaNodeBase):
|
||||
@@ -336,6 +339,7 @@ class PikaScenesV2_2(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -355,7 +359,7 @@ class PikaScenesV2_2(PikaNodeBase):
|
||||
image_ingredient_3: Optional[torch.Tensor] = None,
|
||||
image_ingredient_4: Optional[torch.Tensor] = None,
|
||||
image_ingredient_5: Optional[torch.Tensor] = None,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
# Convert all passed images to BytesIO
|
||||
all_image_bytes_io = []
|
||||
@@ -396,10 +400,10 @@ class PikaScenesV2_2(PikaNodeBase):
|
||||
request=pika_request_data,
|
||||
files=pika_files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikAdditionsNode(PikaNodeBase):
|
||||
@@ -434,6 +438,7 @@ class PikAdditionsNode(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -446,7 +451,7 @@ class PikAdditionsNode(PikaNodeBase):
|
||||
prompt_text: str,
|
||||
negative_prompt: str,
|
||||
seed: int,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
# Convert video to BytesIO
|
||||
video_bytes_io = io.BytesIO()
|
||||
@@ -479,10 +484,10 @@ class PikAdditionsNode(PikaNodeBase):
|
||||
request=pika_request_data,
|
||||
files=pika_files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikaSwapsNode(PikaNodeBase):
|
||||
@@ -526,6 +531,7 @@ class PikaSwapsNode(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -540,7 +546,7 @@ class PikaSwapsNode(PikaNodeBase):
|
||||
prompt_text: str,
|
||||
negative_prompt: str,
|
||||
seed: int,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
# Convert video to BytesIO
|
||||
video_bytes_io = io.BytesIO()
|
||||
@@ -583,10 +589,10 @@ class PikaSwapsNode(PikaNodeBase):
|
||||
request=pika_request_data,
|
||||
files=pika_files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikaffectsNode(PikaNodeBase):
|
||||
@@ -630,6 +636,7 @@ class PikaffectsNode(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -642,7 +649,7 @@ class PikaffectsNode(PikaNodeBase):
|
||||
prompt_text: str,
|
||||
negative_prompt: str,
|
||||
seed: int,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
|
||||
initial_operation = SynchronousOperation(
|
||||
@@ -660,10 +667,10 @@ class PikaffectsNode(PikaNodeBase):
|
||||
),
|
||||
files={"image": ("image.png", tensor_to_bytesio(image), "image/png")},
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
class PikaStartEndFrameNode2_2(PikaNodeBase):
|
||||
@@ -681,6 +688,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -695,7 +703,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase):
|
||||
seed: int,
|
||||
resolution: str,
|
||||
duration: int,
|
||||
auth_token: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[VideoFromFile]:
|
||||
|
||||
pika_files = [
|
||||
@@ -722,10 +730,10 @@ class PikaStartEndFrameNode2_2(PikaNodeBase):
|
||||
),
|
||||
files=pika_files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
return self.execute_task(initial_operation, auth_token)
|
||||
return self.execute_task(initial_operation, auth_kwargs=kwargs)
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
|
||||
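The execute_task docstring above describes the flow every one of these API nodes uses: a synchronous task-creation call, then polling a status endpoint until a terminal state is reached. A standard-library-only sketch of that loop, with hypothetical create_task/get_status functions standing in for SynchronousOperation and PollingOperation:

```python
import time

TERMINAL_OK = {"finished"}
TERMINAL_FAIL = {"failed"}

def create_task(prompt: str, auth_kwargs: dict[str, str]) -> str:
    # Hypothetical: POST the generation request and return its task id.
    return "task-123"

def get_status(task_id: str, auth_kwargs: dict[str, str]) -> dict:
    # Hypothetical: GET the task's status endpoint.
    return {"status": "finished", "url": "https://example.invalid/video.mp4"}

def run_generation(prompt: str, auth_kwargs: dict[str, str], poll_interval: float = 5.0) -> str:
    task_id = create_task(prompt, auth_kwargs)
    while True:
        result = get_status(task_id, auth_kwargs)
        if result["status"] in TERMINAL_OK:
            return result["url"]  # caller downloads the finished video from here
        if result["status"] in TERMINAL_FAIL:
            raise RuntimeError(f"task {task_id} failed")
        time.sleep(poll_interval)
```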
@@ -34,7 +34,7 @@ import requests
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
def upload_image_to_pixverse(image: torch.Tensor, auth_token=None):
|
||||
def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None):
|
||||
# first, upload image to Pixverse and get image id to use in actual generation call
|
||||
files = {
|
||||
"image": tensor_to_bytesio(image)
|
||||
@@ -49,7 +49,7 @@ def upload_image_to_pixverse(image: torch.Tensor, auth_token=None):
|
||||
request=EmptyRequest(),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
)
|
||||
response_upload: PixverseImageUploadResponse = operation.execute()
|
||||
|
||||
@@ -148,6 +148,7 @@ class PixverseTextToVideoNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -161,7 +162,6 @@ class PixverseTextToVideoNode(ComfyNodeABC):
|
||||
seed,
|
||||
negative_prompt: str=None,
|
||||
pixverse_template: int=None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
@@ -190,7 +190,7 @@ class PixverseTextToVideoNode(ComfyNodeABC):
|
||||
template_id=pixverse_template,
|
||||
seed=seed,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -207,7 +207,7 @@ class PixverseTextToVideoNode(ComfyNodeABC):
|
||||
completed_statuses=[PixverseStatus.successful],
|
||||
failed_statuses=[PixverseStatus.contents_moderation, PixverseStatus.failed, PixverseStatus.deleted],
|
||||
status_extractor=lambda x: x.Resp.status,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -278,6 +278,7 @@ class PixverseImageToVideoNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -291,11 +292,10 @@ class PixverseImageToVideoNode(ComfyNodeABC):
|
||||
seed,
|
||||
negative_prompt: str=None,
|
||||
pixverse_template: int=None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
img_id = upload_image_to_pixverse(image, auth_token=auth_token)
|
||||
img_id = upload_image_to_pixverse(image, auth_kwargs=kwargs)
|
||||
|
||||
# 1080p is limited to 5 seconds duration
|
||||
# only normal motion_mode supported for 1080p or for non-5 second duration
|
||||
@@ -322,7 +322,7 @@ class PixverseImageToVideoNode(ComfyNodeABC):
|
||||
template_id=pixverse_template,
|
||||
seed=seed,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -339,7 +339,7 @@ class PixverseImageToVideoNode(ComfyNodeABC):
|
||||
completed_statuses=[PixverseStatus.successful],
|
||||
failed_statuses=[PixverseStatus.contents_moderation, PixverseStatus.failed, PixverseStatus.deleted],
|
||||
status_extractor=lambda x: x.Resp.status,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
@@ -407,6 +407,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -420,12 +421,11 @@ class PixverseTransitionVideoNode(ComfyNodeABC):
|
||||
motion_mode: str,
|
||||
seed,
|
||||
negative_prompt: str=None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
first_frame_id = upload_image_to_pixverse(first_frame, auth_token=auth_token)
|
||||
last_frame_id = upload_image_to_pixverse(last_frame, auth_token=auth_token)
|
||||
first_frame_id = upload_image_to_pixverse(first_frame, auth_kwargs=kwargs)
|
||||
last_frame_id = upload_image_to_pixverse(last_frame, auth_kwargs=kwargs)
|
||||
|
||||
# 1080p is limited to 5 seconds duration
|
||||
# only normal motion_mode supported for 1080p or for non-5 second duration
|
||||
@@ -452,7 +452,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC):
|
||||
negative_prompt=negative_prompt if negative_prompt else None,
|
||||
seed=seed,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -469,7 +469,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC):
|
||||
completed_statuses=[PixverseStatus.successful],
|
||||
failed_statuses=[PixverseStatus.contents_moderation, PixverseStatus.failed, PixverseStatus.deleted],
|
||||
status_extractor=lambda x: x.Resp.status,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll = operation.execute()
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
from inspect import cleandoc
|
||||
from comfy.utils import ProgressBar
|
||||
from comfy_extras.nodes_images import SVG # Added
|
||||
from comfy.comfy_types.node_typing import IO
|
||||
from comfy_api_nodes.apis.recraft_api import (
|
||||
RecraftImageGenerationRequest,
|
||||
@@ -28,9 +29,6 @@ from comfy_api_nodes.apinode_utils import (
|
||||
resize_mask_to_image,
|
||||
validate_string,
|
||||
)
|
||||
import folder_paths
|
||||
import json
|
||||
import os
|
||||
import torch
|
||||
from io import BytesIO
|
||||
from PIL import UnidentifiedImageError
|
||||
@@ -43,7 +41,7 @@ def handle_recraft_file_request(
|
||||
total_pixels=4096*4096,
|
||||
timeout=1024,
|
||||
request=None,
|
||||
auth_token=None
|
||||
auth_kwargs: dict[str,str] = None,
|
||||
) -> list[BytesIO]:
|
||||
"""
|
||||
Handle sending common Recraft file-only request to get back file bytes.
|
||||
@@ -67,7 +65,7 @@ def handle_recraft_file_request(
|
||||
request=request,
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=auth_kwargs,
|
||||
multipart_parser=recraft_multipart_parser,
|
||||
)
|
||||
response: RecraftImageGenerationResponse = operation.execute()
|
||||
@@ -162,102 +160,6 @@ class handle_recraft_image_output:
|
||||
raise Exception("Received output data was not an image; likely an SVG. If you used style_id, make sure it is not a Vector art style.")
|
||||
|
||||
|
||||
class SVG:
|
||||
"""
|
||||
Stores SVG representations via a list of BytesIO objects.
|
||||
"""
|
||||
def __init__(self, data: list[BytesIO]):
|
||||
self.data = data
|
||||
|
||||
def combine(self, other: SVG):
|
||||
return SVG(self.data + other.data)
|
||||
|
||||
@staticmethod
|
||||
def combine_all(svgs: list[SVG]):
|
||||
all_svgs = []
|
||||
for svg in svgs:
|
||||
all_svgs.extend(svg.data)
|
||||
return SVG(all_svgs)
|
||||
|
||||
|
||||
class SaveSVGNode:
|
||||
"""
|
||||
Save SVG files on disk.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.output_dir = folder_paths.get_output_directory()
|
||||
self.type = "output"
|
||||
self.prefix_append = ""
|
||||
|
||||
RETURN_TYPES = ()
|
||||
DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value
|
||||
FUNCTION = "save_svg"
|
||||
CATEGORY = "api node/image/Recraft"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"svg": (RecraftIO.SVG,),
|
||||
"filename_prefix": ("STRING", {"default": "svg/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."})
|
||||
},
|
||||
"hidden": {
|
||||
"prompt": "PROMPT",
|
||||
"extra_pnginfo": "EXTRA_PNGINFO"
|
||||
}
|
||||
}
|
||||
|
||||
def save_svg(self, svg: SVG, filename_prefix="svg/ComfyUI", prompt=None, extra_pnginfo=None):
|
||||
filename_prefix += self.prefix_append
|
||||
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
|
||||
results = list()
|
||||
|
||||
# Prepare metadata JSON
|
||||
metadata_dict = {}
|
||||
if prompt is not None:
|
||||
metadata_dict["prompt"] = prompt
|
||||
if extra_pnginfo is not None:
|
||||
metadata_dict.update(extra_pnginfo)
|
||||
|
||||
# Convert metadata to JSON string
|
||||
metadata_json = json.dumps(metadata_dict, indent=2) if metadata_dict else None
|
||||
|
||||
for batch_number, svg_bytes in enumerate(svg.data):
|
||||
filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
|
||||
file = f"{filename_with_batch_num}_{counter:05}_.svg"
|
||||
|
||||
# Read SVG content
|
||||
svg_bytes.seek(0)
|
||||
svg_content = svg_bytes.read().decode('utf-8')
|
||||
|
||||
# Inject metadata if available
|
||||
if metadata_json:
|
||||
# Create metadata element with CDATA section
|
||||
metadata_element = f""" <metadata>
|
||||
<![CDATA[
|
||||
{metadata_json}
|
||||
]]>
|
||||
</metadata>
|
||||
"""
|
||||
# Insert metadata after opening svg tag using regex
|
||||
import re
|
||||
svg_content = re.sub(r'(<svg[^>]*>)', r'\1\n' + metadata_element, svg_content)
|
||||
|
||||
# Write the modified SVG to file
|
||||
with open(os.path.join(full_output_folder, file), 'wb') as svg_file:
|
||||
svg_file.write(svg_content.encode('utf-8'))
|
||||
|
||||
results.append({
|
||||
"filename": file,
|
||||
"subfolder": subfolder,
|
||||
"type": self.type
|
||||
})
|
||||
counter += 1
|
||||
return { "ui": { "images": results } }
|
||||
|
||||
|
||||
class RecraftColorRGBNode:
|
||||
"""
|
||||
Create Recraft Color by choosing specific RGB values.
|
||||
@@ -485,6 +387,7 @@ class RecraftTextToImageNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -497,7 +400,6 @@ class RecraftTextToImageNode:
|
||||
recraft_style: RecraftStyle = None,
|
||||
negative_prompt: str = None,
|
||||
recraft_controls: RecraftControls = None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False, max_length=1000)
|
||||
@@ -530,7 +432,7 @@ class RecraftTextToImageNode:
|
||||
style_id=recraft_style.style_id,
|
||||
controls=controls_api,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response: RecraftImageGenerationResponse = operation.execute()
|
||||
images = []
|
||||
@@ -620,6 +522,7 @@ class RecraftImageToImageNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -630,7 +533,6 @@ class RecraftImageToImageNode:
|
||||
n: int,
|
||||
strength: float,
|
||||
seed,
|
||||
auth_token=None,
|
||||
recraft_style: RecraftStyle = None,
|
||||
negative_prompt: str = None,
|
||||
recraft_controls: RecraftControls = None,
|
||||
@@ -668,7 +570,7 @@ class RecraftImageToImageNode:
|
||||
image=image[i],
|
||||
path="/proxy/recraft/images/imageToImage",
|
||||
request=request,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
with handle_recraft_image_output():
|
||||
images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0))
|
||||
@@ -736,6 +638,7 @@ class RecraftImageInpaintingNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -746,7 +649,6 @@ class RecraftImageInpaintingNode:
|
||||
prompt: str,
|
||||
n: int,
|
||||
seed,
|
||||
auth_token=None,
|
||||
recraft_style: RecraftStyle = None,
|
||||
negative_prompt: str = None,
|
||||
**kwargs,
|
||||
@@ -781,7 +683,7 @@ class RecraftImageInpaintingNode:
|
||||
mask=mask[i:i+1],
|
||||
path="/proxy/recraft/images/inpaint",
|
||||
request=request,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
with handle_recraft_image_output():
|
||||
images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0))
|
||||
@@ -796,8 +698,8 @@ class RecraftTextToVectorNode:
|
||||
Generates SVG synchronously based on prompt and resolution.
|
||||
"""
|
||||
|
||||
RETURN_TYPES = (RecraftIO.SVG,)
|
||||
DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value
|
||||
RETURN_TYPES = ("SVG",) # Changed
|
||||
DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it
|
||||
FUNCTION = "api_call"
|
||||
API_NODE = True
|
||||
CATEGORY = "api node/image/Recraft"
|
||||
@@ -860,6 +762,7 @@ class RecraftTextToVectorNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -872,7 +775,6 @@ class RecraftTextToVectorNode:
|
||||
seed,
|
||||
negative_prompt: str = None,
|
||||
recraft_controls: RecraftControls = None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
validate_string(prompt, strip_whitespace=False, max_length=1000)
|
||||
@@ -903,7 +805,7 @@ class RecraftTextToVectorNode:
|
||||
substyle=recraft_style.substyle,
|
||||
controls=controls_api,
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response: RecraftImageGenerationResponse = operation.execute()
|
||||
svg_data = []
|
||||
@@ -918,8 +820,8 @@ class RecraftVectorizeImageNode:
|
||||
Generates SVG synchronously from an input image.
|
||||
"""
|
||||
|
||||
RETURN_TYPES = (RecraftIO.SVG,)
|
||||
DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value
|
||||
RETURN_TYPES = ("SVG",) # Changed
|
||||
DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it
|
||||
FUNCTION = "api_call"
|
||||
API_NODE = True
|
||||
CATEGORY = "api node/image/Recraft"
|
||||
@@ -934,13 +836,13 @@ class RecraftVectorizeImageNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(
|
||||
self,
|
||||
image: torch.Tensor,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
svgs = []
|
||||
@@ -950,7 +852,7 @@ class RecraftVectorizeImageNode:
|
||||
sub_bytes = handle_recraft_file_request(
|
||||
image=image[i],
|
||||
path="/proxy/recraft/images/vectorize",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
svgs.append(SVG(sub_bytes))
|
||||
pbar.update(1)
|
||||
@@ -1015,6 +917,7 @@ class RecraftReplaceBackgroundNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -1024,7 +927,6 @@ class RecraftReplaceBackgroundNode:
|
||||
prompt: str,
|
||||
n: int,
|
||||
seed,
|
||||
auth_token=None,
|
||||
recraft_style: RecraftStyle = None,
|
||||
negative_prompt: str = None,
|
||||
**kwargs,
|
||||
@@ -1054,7 +956,7 @@ class RecraftReplaceBackgroundNode:
|
||||
image=image[i],
|
||||
path="/proxy/recraft/images/replaceBackground",
|
||||
request=request,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0))
|
||||
pbar.update(1)
|
||||
@@ -1084,13 +986,13 @@ class RecraftRemoveBackgroundNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(
|
||||
self,
|
||||
image: torch.Tensor,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
images = []
|
||||
@@ -1100,7 +1002,7 @@ class RecraftRemoveBackgroundNode:
|
||||
sub_bytes = handle_recraft_file_request(
|
||||
image=image[i],
|
||||
path="/proxy/recraft/images/removeBackground",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0))
|
||||
pbar.update(1)
|
||||
@@ -1135,13 +1037,13 @@ class RecraftCrispUpscaleNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(
|
||||
self,
|
||||
image: torch.Tensor,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
images = []
|
||||
@@ -1151,7 +1053,7 @@ class RecraftCrispUpscaleNode:
|
||||
sub_bytes = handle_recraft_file_request(
|
||||
image=image[i],
|
||||
path=self.RECRAFT_PATH,
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0))
|
||||
pbar.update(1)
|
||||
@@ -1193,7 +1095,6 @@ NODE_CLASS_MAPPINGS = {
|
||||
"RecraftStyleV3InfiniteStyleLibrary": RecraftStyleInfiniteStyleLibrary,
|
||||
"RecraftColorRGB": RecraftColorRGBNode,
|
||||
"RecraftControls": RecraftControlsNode,
|
||||
"SaveSVG": SaveSVGNode,
|
||||
}
|
||||
|
||||
# A dictionary that contains the friendly/humanly readable titles for the nodes
|
||||
@@ -1213,5 +1114,4 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"RecraftStyleV3InfiniteStyleLibrary": "Recraft Style - Infinite Style Library",
|
||||
"RecraftColorRGB": "Recraft Color RGB",
|
||||
"RecraftControls": "Recraft Controls",
|
||||
"SaveSVG": "Save SVG",
|
||||
}
|
||||
|
||||
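After this change the Recraft module no longer carries its own SVG container and SaveSVG node; it imports the shared SVG class from comfy_extras.nodes_images (the "from comfy_extras.nodes_images import SVG" line added above) and exposes the plain "SVG" socket type. The container is just a list of BytesIO buffers with merge helpers; a small usage sketch, assuming it runs inside a ComfyUI checkout:

```python
from io import BytesIO
from comfy_extras.nodes_images import SVG  # shared container introduced in this change

svg_a = SVG([BytesIO(b"<svg xmlns='http://www.w3.org/2000/svg'/>")])
svg_b = SVG([BytesIO(b"<svg xmlns='http://www.w3.org/2000/svg'/>")])

# combine_all flattens every batch into one container; for two items it is
# equivalent to svg_a.combine(svg_b).
merged = SVG.combine_all([svg_a, svg_b])
print(len(merged.data))  # 2
```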
@@ -120,12 +120,13 @@ class StabilityStableImageUltraNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int,
|
||||
negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None,
|
||||
auth_token=None):
|
||||
**kwargs):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
# prepare image binary if image present
|
||||
image_binary = None
|
||||
@@ -160,7 +161,7 @@ class StabilityStableImageUltraNode:
|
||||
),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -252,12 +253,13 @@ class StabilityStableImageSD_3_5Node:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float,
|
||||
negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None,
|
||||
auth_token=None):
|
||||
**kwargs):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
# prepare image binary if image present
|
||||
image_binary = None
|
||||
@@ -298,7 +300,7 @@ class StabilityStableImageSD_3_5Node:
|
||||
),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -368,11 +370,12 @@ class StabilityUpscaleConservativeNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None,
|
||||
auth_token=None):
|
||||
**kwargs):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read()
|
||||
|
||||
@@ -398,7 +401,7 @@ class StabilityUpscaleConservativeNode:
|
||||
),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -473,11 +476,12 @@ class StabilityUpscaleCreativeNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None,
|
||||
auth_token=None):
|
||||
**kwargs):
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read()
|
||||
|
||||
@@ -506,7 +510,7 @@ class StabilityUpscaleCreativeNode:
|
||||
),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
@@ -521,7 +525,7 @@ class StabilityUpscaleCreativeNode:
|
||||
completed_statuses=[StabilityPollStatus.finished],
|
||||
failed_statuses=[StabilityPollStatus.failed],
|
||||
status_extractor=lambda x: get_async_dummy_status(x),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_poll: StabilityResultsGetResponse = operation.execute()
|
||||
|
||||
@@ -555,11 +559,12 @@ class StabilityUpscaleFastNode:
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
def api_call(self, image: torch.Tensor,
|
||||
auth_token=None):
|
||||
**kwargs):
|
||||
image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read()
|
||||
|
||||
files = {
|
||||
@@ -576,7 +581,7 @@ class StabilityUpscaleFastNode:
|
||||
request=EmptyRequest(),
|
||||
files=files,
|
||||
content_type="multipart/form-data",
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
response_api = operation.execute()
|
||||
|
||||
|
||||
@@ -114,6 +114,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
|
||||
},
|
||||
"hidden": {
|
||||
"auth_token": "AUTH_TOKEN_COMFY_ORG",
|
||||
"comfy_api_key": "API_KEY_COMFY_ORG",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -133,7 +134,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
|
||||
person_generation="ALLOW",
|
||||
seed=0,
|
||||
image=None,
|
||||
auth_token=None,
|
||||
**kwargs,
|
||||
):
|
||||
# Prepare the instances for the request
|
||||
instances = []
|
||||
@@ -179,7 +180,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
|
||||
instances=instances,
|
||||
parameters=parameters
|
||||
),
|
||||
auth_token=auth_token
|
||||
auth_kwargs=kwargs,
|
||||
)
|
||||
|
||||
initial_response = initial_operation.execute()
|
||||
@@ -213,7 +214,7 @@ class VeoVideoGenerationNode(ComfyNodeABC):
|
||||
request=Veo2GenVidPollRequest(
|
||||
operationName=operation_name
|
||||
),
|
||||
auth_token=auth_token,
|
||||
auth_kwargs=kwargs,
|
||||
poll_interval=5.0
|
||||
)
|
||||
|
||||
|
||||
comfy_extras/nodes_ace.py (new file, 49 lines)
@@ -0,0 +1,49 @@
|
||||
import torch
|
||||
import comfy.model_management
|
||||
import node_helpers
|
||||
|
||||
class TextEncodeAceStepAudio:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {
|
||||
"clip": ("CLIP", ),
|
||||
"tags": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||
"lyrics": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||
"lyrics_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "conditioning"
|
||||
|
||||
def encode(self, clip, tags, lyrics, lyrics_strength):
|
||||
tokens = clip.tokenize(tags, lyrics=lyrics)
|
||||
conditioning = clip.encode_from_tokens_scheduled(tokens)
|
||||
conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
|
||||
return (conditioning, )
|
||||
|
||||
|
||||
class EmptyAceStepLatentAudio:
|
||||
def __init__(self):
|
||||
self.device = comfy.model_management.intermediate_device()
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"seconds": ("FLOAT", {"default": 120.0, "min": 1.0, "max": 1000.0, "step": 0.1}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}),
|
||||
}}
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "generate"
|
||||
|
||||
CATEGORY = "latent/audio"
|
||||
|
||||
def generate(self, seconds, batch_size):
|
||||
length = int(seconds * 44100 / 512 / 8)
|
||||
latent = torch.zeros([batch_size, 8, 16, length], device=self.device)
|
||||
return ({"samples": latent, "type": "audio"}, )
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"TextEncodeAceStepAudio": TextEncodeAceStepAudio,
|
||||
"EmptyAceStepLatentAudio": EmptyAceStepLatentAudio,
|
||||
}
|
||||
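EmptyAceStepLatentAudio derives the temporal length of the audio latent from the requested duration: 44100 samples per second, compressed by a factor of 512 and then 8, so the default 120 seconds gives int(120 * 44100 / 512 / 8) = 1291 frames and a latent of shape [batch, 8, 16, 1291]. A quick check of that arithmetic:

```python
import torch

def ace_latent_shape(seconds: float, batch_size: int = 1) -> torch.Size:
    # Mirrors EmptyAceStepLatentAudio.generate: 44.1 kHz audio, 512x then 8x compression.
    length = int(seconds * 44100 / 512 / 8)
    return torch.zeros([batch_size, 8, 16, length]).shape

print(ace_latent_shape(120.0))  # torch.Size([1, 8, 16, 1291])
print(ace_latent_shape(30.0))   # torch.Size([1, 8, 16, 322])
```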
@@ -10,6 +10,9 @@ from PIL.PngImagePlugin import PngInfo
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from inspect import cleandoc
|
||||
|
||||
from comfy.comfy_types import FileLocator
|
||||
|
||||
@@ -190,10 +193,109 @@ class SaveAnimatedPNG:

        return { "ui": { "images": results, "animated": (True,)} }

class SVG:
    """
    Stores SVG representations via a list of BytesIO objects.
    """
    def __init__(self, data: list[BytesIO]):
        self.data = data

    def combine(self, other: 'SVG') -> 'SVG':
        return SVG(self.data + other.data)

    @staticmethod
    def combine_all(svgs: list['SVG']) -> 'SVG':
        all_svgs_list: list[BytesIO] = []
        for svg_item in svgs:
            all_svgs_list.extend(svg_item.data)
        return SVG(all_svgs_list)

class SaveSVGNode:
    """
    Save SVG files on disk.
    """

    def __init__(self):
        self.output_dir = folder_paths.get_output_directory()
        self.type = "output"
        self.prefix_append = ""

    RETURN_TYPES = ()
    DESCRIPTION = cleandoc(__doc__ or "")  # Handle potential None value
    FUNCTION = "save_svg"
    CATEGORY = "image/save"  # Changed
    OUTPUT_NODE = True

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "svg": ("SVG",),  # Changed
                "filename_prefix": ("STRING", {"default": "svg/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."})
            },
            "hidden": {
                "prompt": "PROMPT",
                "extra_pnginfo": "EXTRA_PNGINFO"
            }
        }

    def save_svg(self, svg: SVG, filename_prefix="svg/ComfyUI", prompt=None, extra_pnginfo=None):
        filename_prefix += self.prefix_append
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
        results = list()

        # Prepare metadata JSON
        metadata_dict = {}
        if prompt is not None:
            metadata_dict["prompt"] = prompt
        if extra_pnginfo is not None:
            metadata_dict.update(extra_pnginfo)

        # Convert metadata to JSON string
        metadata_json = json.dumps(metadata_dict, indent=2) if metadata_dict else None

        for batch_number, svg_bytes in enumerate(svg.data):
            filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
            file = f"{filename_with_batch_num}_{counter:05}_.svg"

            # Read SVG content
            svg_bytes.seek(0)
            svg_content = svg_bytes.read().decode('utf-8')

            # Inject metadata if available
            if metadata_json:
                # Create metadata element with CDATA section
                metadata_element = f"""  <metadata>
    <![CDATA[
{metadata_json}
    ]]>
  </metadata>
"""
                # Insert metadata after the opening svg tag using regex with a replacement function
                def replacement(match):
                    # match.group(1) contains the captured <svg> tag
                    return match.group(1) + '\n' + metadata_element

                # Apply the substitution
                svg_content = re.sub(r'(<svg[^>]*>)', replacement, svg_content, flags=re.UNICODE)

            # Write the modified SVG to file
            with open(os.path.join(full_output_folder, file), 'wb') as svg_file:
                svg_file.write(svg_content.encode('utf-8'))

            results.append({
                "filename": file,
                "subfolder": subfolder,
                "type": self.type
            })
            counter += 1
        return { "ui": { "images": results } }

NODE_CLASS_MAPPINGS = {
    "ImageCrop": ImageCrop,
    "RepeatImageBatch": RepeatImageBatch,
    "ImageFromBatch": ImageFromBatch,
    "SaveAnimatedWEBP": SaveAnimatedWEBP,
    "SaveAnimatedPNG": SaveAnimatedPNG,
    "SaveSVGNode": SaveSVGNode,
}

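A minimal read-back sketch, not part of this diff: only the <metadata><![CDATA[ ... ]]> layout written by save_svg above is taken from the source; the helper name and the example path are illustrative.

import json
import re

def extract_comfy_metadata(svg_path: str) -> dict:
    # Recover the JSON block that SaveSVGNode wraps in <metadata><![CDATA[ ... ]]>.
    with open(svg_path, "r", encoding="utf-8") as f:
        svg_content = f.read()
    match = re.search(r"<metadata>\s*<!\[CDATA\[(.*?)\]\]>", svg_content, flags=re.DOTALL)
    if match is None:
        return {}
    return json.loads(match.group(1))

# Example (hypothetical path): extract_comfy_metadata("output/svg/ComfyUI_00001_.svg").get("prompt")
# returns the serialized workflow when metadata was embedded at save time.
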
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.3.32"
__version__ = "0.3.34"

@@ -146,6 +146,8 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
                input_data_all[x] = [unique_id]
            if h[x] == "AUTH_TOKEN_COMFY_ORG":
                input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)]
            if h[x] == "API_KEY_COMFY_ORG":
                input_data_all[x] = [extra_data.get("api_key_comfy_org", None)]
    return input_data_all, missing_keys

map_node_over_list = None #Don't hook this please

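A hedged sketch of how a node could opt into the new hidden input; the class, category, and input names are invented for illustration, and only the "API_KEY_COMFY_ORG" to extra_data["api_key_comfy_org"] mapping comes from the hunk above.

class ExampleAPIKeyNode:
    # Hypothetical node: declaring a hidden "API_KEY_COMFY_ORG" input makes
    # get_input_data inject extra_data.get("api_key_comfy_org", None) at execution time.
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {"text": ("STRING", {"default": ""})},
            "hidden": {"api_key": "API_KEY_COMFY_ORG"},
        }

    RETURN_TYPES = ("STRING",)
    FUNCTION = "run"
    CATEGORY = "api/example"

    def run(self, text, api_key=None):
        # api_key is None when the client did not send api_key_comfy_org in extra_data.
        return (text if api_key else "missing api key",)
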
nodes.py
@@ -246,6 +246,9 @@ class ConditioningZeroOut:
            pooled_output = d.get("pooled_output", None)
            if pooled_output is not None:
                d["pooled_output"] = torch.zeros_like(pooled_output)
            conditioning_lyrics = d.get("conditioning_lyrics", None)
            if conditioning_lyrics is not None:
                d["conditioning_lyrics"] = torch.zeros_like(conditioning_lyrics)
            n = [torch.zeros_like(t[0]), d]
            c.append(n)
        return (c, )

@@ -917,7 +920,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma"], ),
                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),

@@ -2259,6 +2262,7 @@ def init_builtin_extra_nodes():
        "nodes_hidream.py",
        "nodes_fresca.py",
        "nodes_preview_any.py",
        "nodes_ace.py",
    ]

    import_failed = []

@@ -2285,6 +2289,9 @@ def init_builtin_api_nodes():
        "nodes_pika.py",
    ]

    if not load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"):
        return api_nodes_files

    import_failed = []
    for node_file in api_nodes_files:
        if not load_custom_node(os.path.join(api_nodes_dir, node_file), module_parent="comfy_api_nodes"):

@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.3.32"
version = "0.3.34"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.9"

@@ -1,5 +1,5 @@
comfyui-frontend-package==1.18.9
comfyui-workflow-templates==0.1.11
comfyui-frontend-package==1.18.10
comfyui-workflow-templates==0.1.14
torch
torchsde
torchvision

server.py
@@ -32,12 +32,13 @@ from app.frontend_management import FrontendManager
from app.user_manager import UserManager
from app.model_manager import ModelFileManager
from app.custom_node_manager import CustomNodeManager
from typing import Optional
from typing import Optional, Union
from api_server.routes.internal.internal_routes import InternalRoutes

class BinaryEventTypes:
    PREVIEW_IMAGE = 1
    UNENCODED_PREVIEW_IMAGE = 2
    TEXT = 3

async def send_socket_catch_exception(function, message):
    try:

@@ -878,3 +879,15 @@ class PromptServer():
                logging.warning(traceback.format_exc())

        return json_data

    def send_progress_text(
        self, text: Union[bytes, bytearray, str], node_id: str, sid=None
    ):
        if isinstance(text, str):
            text = text.encode("utf-8")
        node_id_bytes = str(node_id).encode("utf-8")

        # Pack the node_id length as a 4-byte unsigned integer, followed by the node_id bytes
        message = struct.pack(">I", len(node_id_bytes)) + node_id_bytes + text

        self.send_sync(BinaryEventTypes.TEXT, message, sid)

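A client-side decoding sketch for the new TEXT frames. The 4-byte big-endian event-type prefix is an assumption based on how ComfyUI frames its other binary websocket events; the node_id length prefix and payload layout are taken from send_progress_text above.

import struct

def decode_text_event(frame: bytes):
    # Assumed frame layout: >I event type | >I node_id length | node_id bytes | utf-8 text.
    event_type = struct.unpack(">I", frame[:4])[0]        # 3 == BinaryEventTypes.TEXT
    node_id_len = struct.unpack(">I", frame[4:8])[0]
    node_id = frame[8:8 + node_id_len].decode("utf-8")
    text = frame[8 + node_id_len:].decode("utf-8")
    return event_type, node_id, text

# Usage: given a binary websocket message msg, decode_text_event(msg.data)
# yields (3, "<node id>", "<progress text>") for progress-text updates.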