From cab61bcd718080998a158aaf560eae274e99892c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:03:49 +0100
Subject: [PATCH 1/5] Add x0 Flux pred (+prepare for others)

---
 denoiser.hpp         | 26 ++++++++++++++++++++++++++
 stable-diffusion.cpp | 19 +++++++++++++++----
 stable-diffusion.h   |  1 +
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/denoiser.hpp b/denoiser.hpp
index 32f402786..b0b9391f1 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -481,6 +481,14 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
+struct ComVisX0Denoiser : public CompVisDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+    }
+};
+
 struct EDMVDenoiser : public CompVisVDenoiser {
     float min_sigma = 0.002;
     float max_sigma = 120.0;
@@ -568,6 +576,15 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
+struct DiscreteFlowX0Denoiser : public DiscreteFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 float flux_time_shift(float mu, float sigma, float t) {
     return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
 }
@@ -631,6 +648,15 @@ struct FluxFlowDenoiser : public Denoiser {
     }
 };
 
+struct FluxFlowX0Denoiser : public FluxFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out = 1.0f;
+        float c_in = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     Flux2FlowDenoiser() = default;
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2cb588213..324a47205 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -828,7 +828,12 @@ class StableDiffusionGGML {
             }
         }
     } else if (sd_version_is_flux(version)) {
-        pred_type = FLUX_FLOW_PRED;
+        if (tensor_storage_map.find("model.diffusion_model.__x0__") != tensor_storage_map.end()) {
+            pred_type = FLUX_FLOW_X0_PRED;
+        } else {
+            pred_type = FLUX_FLOW_PRED;
+        }
+
         if (flow_shift == INFINITY) {
             flow_shift = 1.0f;  // TODO: validate
             for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -871,6 +876,11 @@ class StableDiffusionGGML {
                 denoiser = std::make_shared();
                 break;
             }
+            case FLUX_FLOW_X0_PRED: {
+                LOG_INFO("running in x0-prediction Flux FLOW mode");
+                denoiser = std::make_shared<FluxFlowX0Denoiser>();
+                break;
+            }
             default: {
                 LOG_ERROR("Unknown predition type %i", pred_type);
                 ggml_free(ctx);
@@ -1316,9 +1326,9 @@
             uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
             if (preview_mode == PREVIEW_PROJ) {
-                int64_t patch_sz = 1;
-                const float(*latent_rgb_proj)[channel] = nullptr;
-                float* latent_rgb_bias = nullptr;
+                int64_t patch_sz = 1;
+                const float (*latent_rgb_proj)[channel] = nullptr;
+                float* latent_rgb_bias = nullptr;
 
                 if (dim == 128) {
                     if (sd_version_is_flux2(version)) {
@@ -2424,6 +2434,7 @@ const char* prediction_to_str[] = {
     "edm_v",
     "sd3_flow",
     "flux_flow",
+    "flux_flow_x0"
     "flux2_flow",
 };
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e4abc8dcd..ee56099ba 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -70,6 +70,7 @@ enum prediction_t {
     EDM_V_PRED,
     FLOW_PRED,
     FLUX_FLOW_PRED,
+    FLUX_FLOW_X0_PRED,
     FLUX2_FLOW_PRED,
     PREDICTION_COUNT
 };

From b9da97cc40b83c5bdb1d6b15d01e0e8b1582adaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:04:06 +0100
Subject: [PATCH 2/5] Fix convert models with empty tensors

---
 model.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/model.cpp b/model.cpp
index 0480efefb..131de3c32 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1731,7 +1731,14 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
             //        tensor_storage.n_dims,
             //        tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
             //        tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-
+        
+        if (!tensor->data) {
+            GGML_ASSERT(ggml_nelements(tensor) == 0);
+            // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
+            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
+            tensor->data = ggml_get_mem_buffer(ggml_ctx);
+        }
+        
         *dst_tensor = tensor;
 
         gguf_add_tensor(gguf_ctx, tensor);

From e209ad2c82e56cd2615de9cf807587c8a56df305 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sun, 14 Dec 2025 19:42:11 +0100
Subject: [PATCH 3/5] patch_32 exp support attempt

---
 flux.hpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index 1df2874ae..d3d1f740d 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -781,7 +781,7 @@ namespace Flux {
         Flux(FluxParams params)
             : params(params) {
             if (params.version == VERSION_CHROMA_RADIANCE) {
-                std::pair<int, int> kernel_size = {(int)params.patch_size, (int)params.patch_size};
+                std::pair<int, int> kernel_size = {16, 16};
                 std::pair<int, int> stride = kernel_size;
 
                 blocks["img_in_patch"] = std::make_shared(params.in_channels,
@@ -1068,6 +1068,11 @@ namespace Flux {
             auto img = pad_to_patch_size(ctx->ggml_ctx, x);
             auto orig_img = img;
 
+            if (patch_size != 16) {
+                int ratio = patch_size / 16;
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_NEAREST);
+            }
+
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
             img = img_in_patch->forward(ctx, img);  // [N, hidden_size, H/patch_size, W/patch_size]
@@ -1290,6 +1295,9 @@ namespace Flux {
                 // not schnell
                 flux_params.guidance_embed = true;
             }
+            if (tensor_name.find("__32x32__") != std::string::npos) {
+                flux_params.patch_size = 32;
+            }
             if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                 // Chroma
                 flux_params.is_chroma = true;

From 0cd491b049d44f5e836f5a3238825df81ecd476f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Mon, 15 Dec 2025 00:18:41 +0100
Subject: [PATCH 4/5] improve support for patch_32

---
 flux.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index d3d1f740d..6e56bd648 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -1070,7 +1070,10 @@ namespace Flux {
 
             if (patch_size != 16) {
                 int ratio = patch_size / 16;
-                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_NEAREST);
+                // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
+                // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
+                // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
             }
 
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
             img = img_in_patch->forward(ctx, img);  // [N, hidden_size, H/patch_size, W/patch_size]

From 9e6df4aebd1112f5695a154c56f81a51da030039 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sat, 20 Dec 2025 00:54:07 +0800
Subject: [PATCH 5/5] follow official pipeline

---
 denoiser.hpp               | 26 --------------------------
 examples/common/common.hpp |  2 +-
 flux.hpp                   | 28 ++++++++++++++++++++++++----
 model.cpp                  |  4 ++--
 stable-diffusion.cpp       | 20 ++++++--------------
 stable-diffusion.h         |  1 -
 6 files changed, 33 insertions(+), 48 deletions(-)

diff --git a/denoiser.hpp b/denoiser.hpp
index b0b9391f1..32f402786 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -481,14 +481,6 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
-struct ComVisX0Denoiser : public CompVisDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-    }
-};
-
 struct EDMVDenoiser : public CompVisVDenoiser {
     float min_sigma = 0.002;
     float max_sigma = 120.0;
@@ -576,15 +568,6 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
-struct DiscreteFlowX0Denoiser : public DiscreteFlowDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-        return {c_skip, c_out, c_in};
-    }
-};
-
 float flux_time_shift(float mu, float sigma, float t) {
     return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
 }
@@ -648,15 +631,6 @@ struct FluxFlowDenoiser : public Denoiser {
     }
 };
 
-struct FluxFlowX0Denoiser : public FluxFlowDenoiser {
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 0.0f;
-        float c_out = 1.0f;
-        float c_in = 1.0f;
-        return {c_skip, c_out, c_in};
-    }
-};
-
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     Flux2FlowDenoiser() = default;
 
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
index bf38379d2..b9ac7edc1 100644
--- a/examples/common/common.hpp
+++ b/examples/common/common.hpp
@@ -863,7 +863,7 @@ static bool is_absolute_path(const std::string& p) {
 
 struct SDGenerationParams {
     std::string prompt;
-    std::string prompt_with_lora; // for metadata record only
+    std::string prompt_with_lora;  // for metadata record only
     std::string negative_prompt;
     int clip_skip = -1;  // <= 0 represents unspecified
     int width = 512;
diff --git a/flux.hpp b/flux.hpp
index 6e56bd648..7ce263569 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -744,6 +744,8 @@ namespace Flux {
         int64_t nerf_mlp_ratio = 4;
         int64_t nerf_depth = 4;
         int64_t nerf_max_freqs = 8;
+        bool use_x0 = false;
+        bool use_patch_size_32 = false;
     };
 
     struct FluxParams {
@@ -1044,6 +1046,15 @@ namespace Flux {
             return img;
         }
 
+        struct ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx,
+                                               struct ggml_tensor* predicted,
+                                               struct ggml_tensor* noisy,
+                                               struct ggml_tensor* timesteps) {
+            auto x = ggml_sub(ctx->ggml_ctx, noisy, predicted);
+            x = ggml_div(ctx->ggml_ctx, x, timesteps);
+            return x;
+        }
+
         struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx,
                                                     struct ggml_tensor* x,
                                                     struct ggml_tensor* timestep,
@@ -1068,12 +1079,11 @@ namespace Flux {
             auto img = pad_to_patch_size(ctx->ggml_ctx, x);
             auto orig_img = img;
 
-            if (patch_size != 16) {
-                int ratio = patch_size / 16;
+            if (params.chroma_radiance_params.use_patch_size_32) {
                 // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
                 // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
                 // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
-                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / 2, H / 2, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
             }
 
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
@@ -1112,6 +1122,10 @@ namespace Flux {
 
             out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]
 
+            if (params.chroma_radiance_params.use_x0) {
+                out = _apply_x0_residual(ctx, out, orig_img, timestep);
+            }
+
             return out;
         }
@@ -1298,8 +1312,14 @@ namespace Flux {
                 // not schnell
                 flux_params.guidance_embed = true;
             }
+            if (tensor_name.find("__x0__") != std::string::npos) {
+                LOG_DEBUG("using x0 prediction");
+                flux_params.chroma_radiance_params.use_x0 = true;
+            }
             if (tensor_name.find("__32x32__") != std::string::npos) {
-                flux_params.patch_size = 32;
+                LOG_DEBUG("using patch size 32 prediction");
+                flux_params.chroma_radiance_params.use_patch_size_32 = true;
+                flux_params.patch_size = 32;
             }
             if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                 // Chroma
                 flux_params.is_chroma = true;
diff --git a/model.cpp b/model.cpp
index 131de3c32..74bcdcfe2 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1731,14 +1731,14 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
             //        tensor_storage.n_dims,
             //        tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
             //        tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-        
+
         if (!tensor->data) {
             GGML_ASSERT(ggml_nelements(tensor) == 0);
             // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
             LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
             tensor->data = ggml_get_mem_buffer(ggml_ctx);
         }
-        
+
         *dst_tensor = tensor;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 324a47205..ad8fd5c74 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -695,6 +695,8 @@ class StableDiffusionGGML {
         if (stacked_id) {
             ignore_tensors.insert("pmid.unet.");
         }
+        ignore_tensors.insert("model.diffusion_model.__x0__");
+        ignore_tensors.insert("model.diffusion_model.__32x32__");
 
         if (vae_decode_only) {
             ignore_tensors.insert("first_stage_model.encoder");
@@ -828,12 +830,7 @@ class StableDiffusionGGML {
             }
         }
     } else if (sd_version_is_flux(version)) {
-        if (tensor_storage_map.find("model.diffusion_model.__x0__") != tensor_storage_map.end()) {
-            pred_type = FLUX_FLOW_X0_PRED;
-        } else {
-            pred_type = FLUX_FLOW_PRED;
-        }
-
+        pred_type = FLUX_FLOW_PRED;
         if (flow_shift == INFINITY) {
             flow_shift = 1.0f;  // TODO: validate
             for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -876,11 +874,6 @@ class StableDiffusionGGML {
                 denoiser = std::make_shared();
                 break;
             }
-            case FLUX_FLOW_X0_PRED: {
-                LOG_INFO("running in x0-prediction Flux FLOW mode");
-                denoiser = std::make_shared<FluxFlowX0Denoiser>();
-                break;
-            }
             default: {
                 LOG_ERROR("Unknown predition type %i", pred_type);
                 ggml_free(ctx);
@@ -1326,9 +1319,9 @@
             uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
             if (preview_mode == PREVIEW_PROJ) {
-                int64_t patch_sz = 1;
-                const float (*latent_rgb_proj)[channel] = nullptr;
-                float* latent_rgb_bias = nullptr;
+                int64_t patch_sz = 1;
+                const float(*latent_rgb_proj)[channel] = nullptr;
+                float* latent_rgb_bias = nullptr;
 
                 if (dim == 128) {
                     if (sd_version_is_flux2(version)) {
@@ -2434,7 +2427,6 @@ const char* prediction_to_str[] = {
     "edm_v",
     "sd3_flow",
"flux_flow", - "flux_flow_x0" "flux2_flow", }; diff --git a/stable-diffusion.h b/stable-diffusion.h index ee56099ba..e4abc8dcd 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -70,7 +70,6 @@ enum prediction_t { EDM_V_PRED, FLOW_PRED, FLUX_FLOW_PRED, - FLUX_FLOW_X0_PRED, FLUX2_FLOW_PRED, PREDICTION_COUNT };