
Commit 5bc676c

Refactor noise handling and modulation
- Add select_per_token function for per-token value selection
- Separate adaptive modulation logic
- Clean up t_noisy/t_clean variable naming
- Move image_noise_mask handling from forward to the pipeline
1 parent 4c14cf3 commit 5bc676c

2 files changed: 57 additions & 62 deletions

src/diffusers/models/transformers/transformer_z_image.py

Lines changed: 43 additions & 60 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -152,6 +152,20 @@ def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
     return output
 
 
+def select_per_token(
+    value_noisy: torch.Tensor,
+    value_clean: torch.Tensor,
+    noise_mask: torch.Tensor,
+    seq_len: int,
+) -> torch.Tensor:
+    noise_mask_expanded = noise_mask.unsqueeze(-1)  # (batch, seq_len, 1)
+    return torch.where(
+        noise_mask_expanded == 1,
+        value_noisy.unsqueeze(1).expand(-1, seq_len, -1),
+        value_clean.unsqueeze(1).expand(-1, seq_len, -1),
+    )
+
+
 class FeedForward(nn.Module):
     def __init__(self, dim: int, hidden_dim: int):
         super().__init__()
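
For reference, a minimal standalone sketch (not part of the commit) of what the new helper computes: value_noisy and value_clean each hold one modulation vector per sample, and the mask picks which vector every token receives. The shapes and values below are illustrative only.

import torch

def select_per_token(value_noisy, value_clean, noise_mask, seq_len):
    # Same logic as the helper added above, reproduced so this sketch runs standalone.
    noise_mask_expanded = noise_mask.unsqueeze(-1)  # (batch, seq_len, 1)
    return torch.where(
        noise_mask_expanded == 1,
        value_noisy.unsqueeze(1).expand(-1, seq_len, -1),
        value_clean.unsqueeze(1).expand(-1, seq_len, -1),
    )

batch, seq_len, dim = 2, 4, 8
value_noisy = torch.zeros(batch, dim)      # per-sample vector used for noisy tokens
value_clean = torch.ones(batch, dim)       # per-sample vector used for clean tokens
noise_mask = torch.tensor([[1, 1, 0, 0],
                           [0, 0, 0, 1]])  # 1 = noisy token, 0 = clean token

out = select_per_token(value_noisy, value_clean, noise_mask, seq_len)
print(out.shape)                                  # torch.Size([2, 4, 8])
print(out[0, 0, 0].item(), out[0, 2, 0].item())   # 0.0 (noisy token), 1.0 (clean token)
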
@@ -220,10 +234,10 @@ def forward(
         adaln_clean: Optional[torch.Tensor] = None,
     ):
         if self.modulation:
-            if noise_mask is not None and adaln_noisy is not None and adaln_clean is not None:
-                # Per-token modulation based on noise_mask, (batch, seq_len), 1 for noisy tokens, 0 for clean tokens
-                _, seq_len = x.shape[0], x.shape[1]
+            seq_len = x.shape[1]
 
+            if noise_mask is not None:
+                # Per-token modulation: different modulation for noisy/clean tokens
                 mod_noisy = self.adaLN_modulation(adaln_noisy)
                 mod_clean = self.adaLN_modulation(adaln_clean)
 
@@ -236,33 +250,14 @@ def forward(
                 scale_msa_noisy, scale_mlp_noisy = 1.0 + scale_msa_noisy, 1.0 + scale_mlp_noisy
                 scale_msa_clean, scale_mlp_clean = 1.0 + scale_msa_clean, 1.0 + scale_mlp_clean
 
-                noise_mask_expanded = noise_mask.unsqueeze(-1)  # (batch, seq_len, 1)
-                scale_msa = torch.where(
-                    noise_mask_expanded == 1,
-                    scale_msa_noisy.unsqueeze(1).expand(-1, seq_len, -1),
-                    scale_msa_clean.unsqueeze(1).expand(-1, seq_len, -1),
-                )
-                scale_mlp = torch.where(
-                    noise_mask_expanded == 1,
-                    scale_mlp_noisy.unsqueeze(1).expand(-1, seq_len, -1),
-                    scale_mlp_clean.unsqueeze(1).expand(-1, seq_len, -1),
-                )
-                gate_msa = torch.where(
-                    noise_mask_expanded == 1,
-                    gate_msa_noisy.unsqueeze(1).expand(-1, seq_len, -1),
-                    gate_msa_clean.unsqueeze(1).expand(-1, seq_len, -1),
-                )
-                gate_mlp = torch.where(
-                    noise_mask_expanded == 1,
-                    gate_mlp_noisy.unsqueeze(1).expand(-1, seq_len, -1),
-                    gate_mlp_clean.unsqueeze(1).expand(-1, seq_len, -1),
-                )
+                scale_msa = select_per_token(scale_msa_noisy, scale_msa_clean, noise_mask, seq_len)
+                scale_mlp = select_per_token(scale_mlp_noisy, scale_mlp_clean, noise_mask, seq_len)
+                gate_msa = select_per_token(gate_msa_noisy, gate_msa_clean, noise_mask, seq_len)
+                gate_mlp = select_per_token(gate_mlp_noisy, gate_mlp_clean, noise_mask, seq_len)
             else:
-                # Original global modulation
-                assert adaln_input is not None
-                scale_msa, gate_msa, scale_mlp, gate_mlp = (
-                    self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
-                )
+                # Global modulation: same modulation for all tokens (avoid double select)
+                mod = self.adaLN_modulation(adaln_input)
+                scale_msa, gate_msa, scale_mlp, gate_mlp = mod.unsqueeze(1).chunk(4, dim=2)
             gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
             scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
 
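In the rewritten global branch, a single adaLN projection is reshaped once and split into the four modulation tensors. A rough standalone sketch of that reshaping (not part of the commit), with a random tensor standing in for self.adaLN_modulation(adaln_input):

import torch

batch, dim = 2, 16
mod = torch.randn(batch, 4 * dim)  # stand-in for self.adaLN_modulation(adaln_input)

# unsqueeze(1) gives (batch, 1, 4*dim); chunk(4, dim=2) yields four (batch, 1, dim) tensors
scale_msa, gate_msa, scale_mlp, gate_mlp = mod.unsqueeze(1).chunk(4, dim=2)
print(scale_msa.shape)  # torch.Size([2, 1, 16])

gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp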

@@ -297,18 +292,13 @@ def __init__(self, hidden_size, out_channels):
         )
 
     def forward(self, x, c=None, noise_mask=None, c_noisy=None, c_clean=None):
-        if noise_mask is not None and c_noisy is not None and c_clean is not None:
-            # Per-token modulation based on noise_mask
-            _, seq_len = x.shape[0], x.shape[1]
+        seq_len = x.shape[1]
+
+        if noise_mask is not None:
+            # Per-token modulation
             scale_noisy = 1.0 + self.adaLN_modulation(c_noisy)
             scale_clean = 1.0 + self.adaLN_modulation(c_clean)
-
-            noise_mask_expanded = noise_mask.unsqueeze(-1)
-            scale = torch.where(
-                noise_mask_expanded == 1,
-                scale_noisy.unsqueeze(1).expand(-1, seq_len, -1),
-                scale_clean.unsqueeze(1).expand(-1, seq_len, -1),
-            )
+            scale = select_per_token(scale_noisy, scale_clean, noise_mask, seq_len)
         else:
             # Original global modulation
             assert c is not None, "Either c or (c_noisy, c_clean) must be provided"
@@ -900,29 +890,29 @@ def patchify_and_embed_omni(
 
     def forward(
         self,
-        x: List[torch.Tensor],
+        x: Union[List[torch.Tensor], List[List[torch.Tensor]]],
         t,
-        cap_feats: List[torch.Tensor],
+        cap_feats: Union[List[torch.Tensor], List[List[torch.Tensor]]],
         controlnet_block_samples: Optional[Dict[int, torch.Tensor]] = None,
-        cond_latents: Optional[List[List[torch.Tensor]]] = None,
         siglip_feats: Optional[List[List[torch.Tensor]]] = None,
-        patch_size=2,
-        f_patch_size=1,
+        image_noise_mask: Optional[List[List[int]]] = None,
+        patch_size: int = 2,
+        f_patch_size: int = 1,
         return_dict: bool = True,
     ):
         assert patch_size in self.all_patch_size
         assert f_patch_size in self.all_f_patch_size
 
-        # Determine mode based on cond_latents
-        omni_mode = cond_latents is not None
+        # Omni mode: x contains lists (multi-image input)
+        omni_mode = isinstance(x[0], list)
 
         if omni_mode:
             return self._forward_omni(
                 x,
                 t,
                 cap_feats,
-                cond_latents,
                 siglip_feats,
+                image_noise_mask,
                 controlnet_block_samples,
                 patch_size,
                 f_patch_size,
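
The mode switch no longer depends on a cond_latents argument; it is inferred from the nesting of x. A small illustration of the distinction (not part of the commit; the latent shapes are placeholders, not the model's actual layout):

import torch

# Basic mode: one latent tensor per sample
basic_x = [torch.randn(16, 1, 64, 64) for _ in range(2)]
# Omni mode: per sample, a list of condition latents followed by the target latent
omni_x = [[torch.randn(16, 1, 64, 64), torch.randn(16, 1, 64, 64)] for _ in range(2)]

assert not isinstance(basic_x[0], list)  # routed to _forward_basic
assert isinstance(omni_x[0], list)       # routed to _forward_omni
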
@@ -1061,30 +1051,23 @@ def _forward_basic(
 
     def _forward_omni(
         self,
-        x: List[torch.Tensor],
+        x: List[List[torch.Tensor]],
         t,
         cap_feats: List[List[torch.Tensor]],
-        cond_latents: List[List[torch.Tensor]],
         siglip_feats: List[List[torch.Tensor]],
+        image_noise_mask: List[List[int]],
         controlnet_block_samples: Optional[Dict[int, torch.Tensor]],
         patch_size: int,
         f_patch_size: int,
         return_dict: bool,
     ):
         """Omni mode forward pass with image conditioning."""
         bsz = len(x)
-        device = x[0].device
+        device = x[0][-1].device  # From target latent
 
         # Create dual timestep embeddings: one for noisy tokens (t), one for clean tokens (t=1)
-        t_combined = torch.cat([t, torch.ones_like(t, dtype=t.dtype, device=device)], dim=0)
-        t_combined = t_combined * self.t_scale
-        t_combined = self.t_embedder(t_combined)
-        t_noisy = t_combined[:bsz]  # Original timestep for noisy tokens
-        t_clean = t_combined[bsz:]  # t=1 for clean (condition) tokens
-
-        # Combine condition latents with target latent
-        x = [cond_latents[i] + [x[i]] for i in range(bsz)]
-        image_noise_mask = [[0] * (len(x[i]) - 1) + [1] for i in range(bsz)]
+        t_noisy = self.t_embedder(t * self.t_scale)
+        t_clean = self.t_embedder(torch.ones_like(t) * self.t_scale)
 
         # Patchify and embed for Omni mode
         (
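
The dual timestep embeddings are now two direct embedder calls instead of concatenating, embedding once, and splitting. A standalone sketch (not part of the commit) of why the two formulations agree, with an nn.Linear standing in for self.t_embedder and a made-up t_scale:

import torch
import torch.nn as nn

t_embedder = nn.Linear(1, 8)  # stand-in for self.t_embedder (assumption, not the real module)
t_scale = 1000.0              # stand-in for self.t_scale
t = torch.rand(2, 1)          # toy timesteps
bsz = t.shape[0]

# Old formulation: batch noisy and clean timesteps, embed once, then split
t_combined = torch.cat([t, torch.ones_like(t)], dim=0) * t_scale
t_combined = t_embedder(t_combined)
old_noisy, old_clean = t_combined[:bsz], t_combined[bsz:]

# New formulation: two direct calls
new_noisy = t_embedder(t * t_scale)
new_clean = t_embedder(torch.ones_like(t) * t_scale)

assert torch.allclose(old_noisy, new_noisy, atol=1e-6)
assert torch.allclose(old_clean, new_clean, atol=1e-6)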

src/diffusers/pipelines/z_image/pipeline_z_image_omni.py

Lines changed: 14 additions & 2 deletions
@@ -657,12 +657,24 @@ def __call__(
                 latent_model_input = latent_model_input.unsqueeze(2)
                 latent_model_input_list = list(latent_model_input.unbind(dim=0))
 
+                # Combine condition latents with target latent
+                current_batch_size = len(latent_model_input_list)
+                x_combined = [
+                    condition_latents_model_input[i] + [latent_model_input_list[i]]
+                    for i in range(current_batch_size)
+                ]
+                # Create noise mask: 0 for condition images (clean), 1 for target image (noisy)
+                image_noise_mask = [
+                    [0] * len(condition_latents_model_input[i]) + [1]
+                    for i in range(current_batch_size)
+                ]
+
                 model_out_list = self.transformer(
-                    x=latent_model_input_list,
+                    x=x_combined,
                     t=timestep_model_input,
                     cap_feats=prompt_embeds_model_input,
-                    cond_latents=condition_latents_model_input,
                     siglip_feats=condition_siglip_embeds_model_input,
+                    image_noise_mask=image_noise_mask,
                     return_dict=False,
                 )[0]
 
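
A toy run of the list construction that moved into the pipeline (not part of the commit): each sample's condition latents are followed by its target latent, and the mask marks only the target as noisy. Tensor shapes are placeholders.

import torch

condition_latents_model_input = [
    [torch.randn(16, 1, 32, 32)],                               # sample 0: one condition image
    [torch.randn(16, 1, 32, 32), torch.randn(16, 1, 32, 32)],   # sample 1: two condition images
]
latent_model_input_list = [torch.randn(16, 1, 32, 32) for _ in range(2)]

current_batch_size = len(latent_model_input_list)
x_combined = [
    condition_latents_model_input[i] + [latent_model_input_list[i]]
    for i in range(current_batch_size)
]
image_noise_mask = [
    [0] * len(condition_latents_model_input[i]) + [1]
    for i in range(current_batch_size)
]

print([len(sample) for sample in x_combined])  # [2, 3]
print(image_noise_mask)                        # [[0, 1], [0, 0, 1]]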
