diff --git a/examples/common/common.hpp b/examples/common/common.hpp index bf38379d2..b9ac7edc1 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -863,7 +863,7 @@ static bool is_absolute_path(const std::string& p) { struct SDGenerationParams { std::string prompt; - std::string prompt_with_lora; // for metadata record only + std::string prompt_with_lora; // for metadata record only std::string negative_prompt; int clip_skip = -1; // <= 0 represents unspecified int width = 512; diff --git a/flux.hpp b/flux.hpp index 1df2874ae..7ce263569 100644 --- a/flux.hpp +++ b/flux.hpp @@ -744,6 +744,8 @@ namespace Flux { int64_t nerf_mlp_ratio = 4; int64_t nerf_depth = 4; int64_t nerf_max_freqs = 8; + bool use_x0 = false; + bool use_patch_size_32 = false; }; struct FluxParams { @@ -781,7 +783,7 @@ namespace Flux { Flux(FluxParams params) : params(params) { if (params.version == VERSION_CHROMA_RADIANCE) { - std::pair<int, int> kernel_size = {(int)params.patch_size, (int)params.patch_size}; + std::pair<int, int> kernel_size = {16, 16}; std::pair<int, int> stride = kernel_size; blocks["img_in_patch"] = std::make_shared<Conv2d>(params.in_channels, @@ -1044,6 +1046,15 @@ namespace Flux { return img; } + struct ggml_tensor* _apply_x0_residual(GGMLRunnerContext* ctx, + struct ggml_tensor* predicted, + struct ggml_tensor* noisy, + struct ggml_tensor* timesteps) { + auto x = ggml_sub(ctx->ggml_ctx, noisy, predicted); + x = ggml_div(ctx->ggml_ctx, x, timesteps); + return x; + } + struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* timestep, @@ -1068,6 +1079,13 @@ namespace Flux { auto img = pad_to_patch_size(ctx->ggml_ctx, x); auto orig_img = img; + if (params.chroma_radiance_params.use_patch_size_32) { + // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable + // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently from the one in PyTorch?
+ // img = F.interpolate(img, size=(H//2, W//2), mode="nearest") + img = ggml_interpolate(ctx->ggml_ctx, img, W / 2, H / 2, C, x->ne[3], GGML_SCALE_MODE_BILINEAR); + } + auto img_in_patch = std::dynamic_pointer_cast<Conv2d>(blocks["img_in_patch"]); img = img_in_patch->forward(ctx, img); // [N, hidden_size, H/patch_size, W/patch_size] @@ -1104,6 +1122,10 @@ namespace Flux { out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W] + if (params.chroma_radiance_params.use_x0) { + out = _apply_x0_residual(ctx, out, orig_img, timestep); + } + return out; } @@ -1290,6 +1312,15 @@ namespace Flux { // not schnell flux_params.guidance_embed = true; } + if (tensor_name.find("__x0__") != std::string::npos) { + LOG_DEBUG("using x0 prediction"); + flux_params.chroma_radiance_params.use_x0 = true; + } + if (tensor_name.find("__32x32__") != std::string::npos) { + LOG_DEBUG("using patch size 32 prediction"); + flux_params.chroma_radiance_params.use_patch_size_32 = true; + flux_params.patch_size = 32; + } if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) { // Chroma flux_params.is_chroma = true; diff --git a/model.cpp b/model.cpp index 0480efefb..74bcdcfe2 100644 --- a/model.cpp +++ b/model.cpp @@ -1732,6 +1732,13 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type // tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3], // tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + if (!tensor->data) { + GGML_ASSERT(ggml_nelements(tensor) == 0); + // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors + LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str()); + tensor->data = ggml_get_mem_buffer(ggml_ctx); + } + *dst_tensor = tensor; gguf_add_tensor(gguf_ctx, tensor); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 2cb588213..ad8fd5c74 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ 
-695,6 +695,8 @@ class StableDiffusionGGML { if (stacked_id) { ignore_tensors.insert("pmid.unet."); } + ignore_tensors.insert("model.diffusion_model.__x0__"); + ignore_tensors.insert("model.diffusion_model.__32x32__"); if (vae_decode_only) { ignore_tensors.insert("first_stage_model.encoder"); @@ -829,6 +831,7 @@ class StableDiffusionGGML { } } else if (sd_version_is_flux(version)) { pred_type = FLUX_FLOW_PRED; + if (flow_shift == INFINITY) { flow_shift = 1.0f; // TODO: validate for (const auto& [name, tensor_storage] : tensor_storage_map) {