From cab61bcd718080998a158aaf560eae274e99892c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:03:49 +0100
Subject: [PATCH 1/5] Add x0 Flux pred (+prepare for others)

---
 denoiser.hpp         | 26 ++++++++++++++++++++++++++
 stable-diffusion.cpp | 19 +++++++++++++++----
 stable-diffusion.h   |  1 +
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/denoiser.hpp b/denoiser.hpp
index 32f402786..b0b9391f1 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -481,6 +481,14 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
+struct ComVisX0Denoiser : public CompVisDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+    }
+};
+
 struct EDMVDenoiser : public CompVisVDenoiser {
     float min_sigma = 0.002;
     float max_sigma = 120.0;
@@ -568,6 +576,15 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
+struct DiscreteFlowX0Denoiser : public DiscreteFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 float flux_time_shift(float mu, float sigma, float t) {
     return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
 }
@@ -631,6 +648,15 @@ struct FluxFlowDenoiser : public Denoiser {
     }
 };
 
+struct FluxFlowX0Denoiser : public FluxFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     Flux2FlowDenoiser() = default;
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2cb588213..324a47205 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -828,7 +828,12 @@ class StableDiffusionGGML {
             }
         }
     } else if (sd_version_is_flux(version)) {
-        pred_type = FLUX_FLOW_PRED;
+        if (tensor_storage_map.find("model.diffusion_model.__x0__") != tensor_storage_map.end()) {
+            pred_type = FLUX_FLOW_X0_PRED;
+        } else {
+            pred_type = FLUX_FLOW_PRED;
+        }
+
         if (flow_shift == INFINITY) {
             flow_shift = 1.0f;  // TODO: validate
             for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -871,6 +876,11 @@ class StableDiffusionGGML {
                 denoiser = std::make_shared();
                 break;
             }
+            case FLUX_FLOW_X0_PRED: {
+                LOG_INFO("running in x0-prediction Flux FLOW mode");
+                denoiser = std::make_shared<FluxFlowX0Denoiser>();
+                break;
+            }
             default: {
                 LOG_ERROR("Unknown predition type %i", pred_type);
                 ggml_free(ctx);
@@ -1316,9 +1326,9 @@
             uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
             if (preview_mode == PREVIEW_PROJ) {
-                int64_t patch_sz = 1;
-                const float(*latent_rgb_proj)[channel] = nullptr;
-                float* latent_rgb_bias = nullptr;
+                int64_t patch_sz = 1;
+                const float (*latent_rgb_proj)[channel] = nullptr;
+                float* latent_rgb_bias = nullptr;
 
                 if (dim == 128) {
                     if (sd_version_is_flux2(version)) {
@@ -2424,6 +2434,7 @@ const char* prediction_to_str[] = {
     "edm_v",
     "sd3_flow",
     "flux_flow",
+    "flux_flow_x0"
     "flux2_flow",
 };
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e4abc8dcd..ee56099ba 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -70,6 +70,7 @@ enum prediction_t {
     EDM_V_PRED,
     FLOW_PRED,
     FLUX_FLOW_PRED,
+    FLUX_FLOW_X0_PRED,
     FLUX2_FLOW_PRED,
     PREDICTION_COUNT
 };

From b9da97cc40b83c5bdb1d6b15d01e0e8b1582adaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:04:06 +0100
Subject: [PATCH 2/5] Fix convert models with empty tensors

---
 model.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/model.cpp b/model.cpp
index 0480efefb..131de3c32 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1731,7 +1731,14 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
             //        tensor_storage.n_dims,
             //        tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
             //        tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-
+        
+        if (!tensor->data) {
+            GGML_ASSERT(ggml_nelements(tensor) == 0);
+            // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
+            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
+            tensor->data = ggml_get_mem_buffer(ggml_ctx);
+        }
+        
         *dst_tensor = tensor;
 
         gguf_add_tensor(gguf_ctx, tensor);

From e209ad2c82e56cd2615de9cf807587c8a56df305 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:42:11 +0100
Subject: [PATCH 3/5] patch_32 exp support attempt

---
 flux.hpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index 1df2874ae..d3d1f740d 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -781,7 +781,7 @@ namespace Flux {
         Flux(FluxParams params)
             : params(params) {
             if (params.version == VERSION_CHROMA_RADIANCE) {
-                std::pair<int, int> kernel_size = {(int)params.patch_size, (int)params.patch_size};
+                std::pair<int, int> kernel_size = {16, 16};
                 std::pair<int, int> stride = kernel_size;
 
                 blocks["img_in_patch"] = std::make_shared(params.in_channels,
@@ -1068,6 +1068,11 @@ namespace Flux {
             auto img = pad_to_patch_size(ctx->ggml_ctx, x);
             auto orig_img = img;
 
+            if (patch_size != 16) {
+                int ratio = patch_size / 16;
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_NEAREST);
+            }
+
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
             img = img_in_patch->forward(ctx, img);  // [N, hidden_size, H/patch_size, W/patch_size]
@@ -1290,6 +1295,9 @@ namespace Flux {
                 // not schnell
                 flux_params.guidance_embed = true;
             }
+            if (tensor_name.find("__32x32__") != std::string::npos) {
+                flux_params.patch_size = 32;
+            }
             if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                 // Chroma
                 flux_params.is_chroma = true;

From 0cd491b049d44f5e836f5a3238825df81ecd476f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Mon, 15 Dec 2025 00:18:41 +0100
Subject: [PATCH 4/5] improve support for patch_32

---
 flux.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index d3d1f740d..6e56bd648 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -1070,7 +1070,10 @@ namespace Flux {
 
             if (patch_size != 16) {
                 int ratio = patch_size / 16;
-                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_NEAREST);
+                // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
+                // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
+                // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
             }
 
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
             img = img_in_patch->forward(ctx, img);  // [N, hidden_size, H/patch_size, W/patch_size]

From 9e6df4aebd1112f5695a154c56f81a51da030039 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sat, 20 Dec 2025 00:54:07 +0800
Subject: [PATCH 5/5] follow official pipeline

---
 denoiser.hpp               | 26 --------------------------
 examples/common/common.hpp |  2 +-
 flux.hpp                   | 28 ++++++++++++++++++++++++----
 model.cpp                  |  4 ++--
 stable-diffusion.cpp       | 20 ++++++--------------
 stable-diffusion.h         |  1 -
 6 files changed, 33 insertions(+), 48 deletions(-)

diff --git a/denoiser.hpp b/denoiser.hpp
index b0b9391f1..32f402786 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -481,14 +481,6 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
-struct ComVisX0Denoiser : public CompVisDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-    }
-};
-
 struct EDMVDenoiser : public CompVisVDenoiser {
     float min_sigma = 0.002;
     float max_sigma = 120.0;
@@ -576,15 +568,6 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
-struct DiscreteFlowX0Denoiser : public DiscreteFlowDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-        return {c_skip, c_out, c_in};
-    }
-};
-
 float flux_time_shift(float mu, float sigma, float t) {
     return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
 }
@@ -648,15 +631,6 @@ struct FluxFlowDenoiser : public Denoiser {
     }
 };
 
-struct FluxFlowX0Denoiser : public FluxFlowDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-        return {c_skip, c_out, c_in};
-    }
-};
-
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     Flux2FlowDenoiser() = default;
 
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index bf38379d2..b9ac7edc1 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -863,7 +863,7 @@ static bool is_absolute_path(const std::string& p) {
 
 struct SDGenerationParams {
     std::string prompt;
-    std::string prompt_with_lora; // for metadata record only
+    std::string prompt_with_lora;  // for metadata record only
     std::string negative_prompt;
     int clip_skip = -1;  // <= 0 represents unspecified
     int width = 512;
diff --git a/flux.hpp b/flux.hpp
index 6e56bd648..7ce263569 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -744,6 +744,8 @@ namespace Flux {
         int64_t nerf_mlp_ratio = 4;
         int64_t nerf_depth = 4;
         int64_t nerf_max_freqs = 8;
+        bool use_x0 = false;
+        bool use_patch_size_32 = false;
     };
 
     struct FluxParams {
@@ -1044,6 +1046,15 @@ namespace Flux {
             return img;
         }
 
+        struct ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx,
+                                               struct ggml_tensor* predicted,
+                                               struct ggml_tensor* noisy,
+                                               struct ggml_tensor* timesteps) {
+            auto x = ggml_sub(ctx->ggml_ctx, noisy, predicted);
+            x = ggml_div(ctx->ggml_ctx, x, timesteps);
+            return x;
+        }
+
         struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx,
                                                     struct ggml_tensor* x,
                                                     struct ggml_tensor* timestep,
@@ -1068,12 +1079,11 @@ namespace Flux {
             auto img = pad_to_patch_size(ctx->ggml_ctx, x);
             auto orig_img = img;
 
-            if (patch_size != 16) {
-                int ratio = patch_size / 16;
+            if (params.chroma_radiance_params.use_patch_size_32) {
                 // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
                 // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
                 // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
-                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / 2, H / 2, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
             }
 
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
@@ -1112,6 +1122,10 @@ namespace Flux {
 
             out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]
 
+            if (params.chroma_radiance_params.use_x0) {
+                out = _apply_x0_residual(ctx, out, orig_img, timestep);
+            }
+
             return out;
         }
@@ -1298,8 +1312,14 @@ namespace Flux {
                 // not schnell
                 flux_params.guidance_embed = true;
             }
+            if (tensor_name.find("__x0__") != std::string::npos) {
+                LOG_DEBUG("using x0 prediction");
+                flux_params.chroma_radiance_params.use_x0 = true;
+            }
             if (tensor_name.find("__32x32__") != std::string::npos) {
-                flux_params.patch_size = 32;
+                LOG_DEBUG("using patch size 32 prediction");
+                flux_params.chroma_radiance_params.use_patch_size_32 = true;
+                flux_params.patch_size = 32;
             }
             if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                 // Chroma
                 flux_params.is_chroma = true;
diff --git a/model.cpp b/model.cpp
index 131de3c32..74bcdcfe2 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1731,14 +1731,14 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
             //        tensor_storage.n_dims,
             //        tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
             //        tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-        
+
         if (!tensor->data) {
             GGML_ASSERT(ggml_nelements(tensor) == 0);
             // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
             LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
             tensor->data = ggml_get_mem_buffer(ggml_ctx);
         }
-        
+
         *dst_tensor = tensor;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 324a47205..ad8fd5c74 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -695,6 +695,8 @@ class StableDiffusionGGML {
         if (stacked_id) {
             ignore_tensors.insert("pmid.unet.");
         }
+        ignore_tensors.insert("model.diffusion_model.__x0__");
+        ignore_tensors.insert("model.diffusion_model.__32x32__");
 
         if (vae_decode_only) {
             ignore_tensors.insert("first_stage_model.encoder");
@@ -828,12 +830,7 @@ class StableDiffusionGGML {
             }
         }
     } else if (sd_version_is_flux(version)) {
-        if (tensor_storage_map.find("model.diffusion_model.__x0__") != tensor_storage_map.end()) {
-            pred_type = FLUX_FLOW_X0_PRED;
-        } else {
-            pred_type = FLUX_FLOW_PRED;
-        }
-
+        pred_type = FLUX_FLOW_PRED;
         if (flow_shift == INFINITY) {
             flow_shift = 1.0f;  // TODO: validate
             for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -876,11 +874,6 @@ class StableDiffusionGGML {
                 denoiser = std::make_shared();
                 break;
             }
-            case FLUX_FLOW_X0_PRED: {
-                LOG_INFO("running in x0-prediction Flux FLOW mode");
-                denoiser = std::make_shared<FluxFlowX0Denoiser>();
-                break;
-            }
             default: {
                 LOG_ERROR("Unknown predition type %i", pred_type);
                 ggml_free(ctx);
@@ -1326,9 +1319,9 @@
             uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
             if (preview_mode == PREVIEW_PROJ) {
-                int64_t patch_sz = 1;
-                const float (*latent_rgb_proj)[channel] = nullptr;
-                float* latent_rgb_bias = nullptr;
+                int64_t patch_sz = 1;
+                const float(*latent_rgb_proj)[channel] = nullptr;
+                float* latent_rgb_bias = nullptr;
 
                 if (dim == 128) {
                     if (sd_version_is_flux2(version)) {
@@ -2434,7 +2427,6 @@ const char* prediction_to_str[] = {
     "edm_v",
     "sd3_flow",
"flux_flow", - "flux_flow_x0" "flux2_flow", }; diff --git a/stable-diffusion.h b/stable-diffusion.h index ee56099ba..e4abc8dcd 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -70,7 +70,6 @@ enum prediction_t { EDM_V_PRED, FLOW_PRED, FLUX_FLOW_PRED, - FLUX_FLOW_X0_PRED, FLUX2_FLOW_PRED, PREDICTION_COUNT };