@@ -18,14 +18,15 @@
 
 import numpy as np
 import torch
+import torch.nn.functional as F
 from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
 
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import QwenImageLoraLoaderMixin
 from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel
 from ...models.controlnets.controlnet_qwenimage import QwenImageControlNetModel, QwenImageMultiControlNetModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import QwenImagePipelineOutput
@@ -970,6 +971,14 @@ def __call__(
                     return_dict=False,
                 )
 
+                if image_latents is not None:
+                    padding_size = image_latents.shape[1]
+                    for i, sample in enumerate(controlnet_block_samples):
+                        # Right-pad dim 1 (the sequence dim) of each block sample with padding_size zeros
+                        pad_tuple = [0] * (2 * sample.dim())
+                        pad_tuple[-3] = padding_size
+                        controlnet_block_samples[i] = F.pad(sample, pad_tuple, mode="constant", value=0)
+
                 with self.transformer.cache_context("cond"):
                     noise_pred = self.transformer(
                         hidden_states=latent_model_input,
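
For reference, here is a standalone sketch of the padding trick the new block relies on, using made-up shapes (a 3-D block sample and a hypothetical image-latents sequence length); it illustrates the F.pad semantics only, not the pipeline itself. F.pad consumes the pad sequence from the last dimension backwards, so index -3 always addresses the right-hand pad of dim 1, whatever the tensor's rank.

import torch
import torch.nn.functional as F

# Made-up stand-ins: one controlnet block sample of shape (batch, seq, dim)
# and image latents that occupy 4 extra sequence positions (hypothetical
# value for image_latents.shape[1]).
sample = torch.randn(2, 16, 8)
image_latents_seq_len = 4

# For a 3-D tensor the pad sequence is
# (last_left, last_right, dim1_left, dim1_right, dim0_left, dim0_right),
# so index -3 is the right-hand pad of dim 1 for any sample.dim().
pad_tuple = [0] * (2 * sample.dim())
pad_tuple[-3] = image_latents_seq_len
padded = F.pad(sample, pad_tuple, mode="constant", value=0)

print(padded.shape)                       # torch.Size([2, 20, 8])
print(padded[:, 16:].abs().sum().item())  # 0.0 -- the new positions are all zeros

Because the padding is zeros, the controlnet residuals leave the image_latents positions of the transformer's sequence untouched, which appears to be the intent of the change.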