@@ -1720,7 +1720,7 @@ struct LLMEmbedder : public Conditioner {
17201720 std::string current_part;
17211721
17221722 for (char c : curr_text) {
1723- if (c == ' \' ' ) {
1723+ if (c == ' " ' ) {
17241724 if (!current_part.empty ()) {
17251725 parts.push_back (current_part);
17261726 current_part.clear ();
@@ -1741,7 +1741,7 @@ struct LLMEmbedder : public Conditioner {
17411741 for (const auto & part : parts) {
17421742 if (part.empty ())
17431743 continue ;
1744- if (part[0 ] == ' \' ' && part.back () == ' \' ' ) {
1744+ if (part[0 ] == ' " ' && part.back () == ' " ' ) {
17451745 std::string quoted_content = part.substr (1 , part.size () - 2 );
17461746 for (char ch : quoted_content) {
17471747 std::string char_str (1 , ch);
@@ -1778,68 +1778,140 @@ struct LLMEmbedder : public Conditioner {
17781778 bool spell_quotes = false ;
17791779 std::set<int > out_layers;
17801780 if (llm->enable_vision && conditioner_params.ref_images .size () > 0 ) {
1781- LOG_INFO (" QwenImageEditPlusPipeline" );
1782- prompt_template_encode_start_idx = 64 ;
1783- int image_embed_idx = 64 + 6 ;
1784-
1785- int min_pixels = 384 * 384 ;
1786- int max_pixels = 560 * 560 ;
1787- std::string placeholder = " <|image_pad|>" ;
1788- std::string img_prompt;
1789-
1790- for (int i = 0 ; i < conditioner_params.ref_images .size (); i++) {
1791- sd_image_f32_t image = sd_image_t_to_sd_image_f32_t (*conditioner_params.ref_images [i]);
1792- double factor = llm->params .vision .patch_size * llm->params .vision .spatial_merge_size ;
1793- int height = image.height ;
1794- int width = image.width ;
1795- int h_bar = static_cast <int >(std::round (height / factor)) * factor;
1796- int w_bar = static_cast <int >(std::round (width / factor)) * factor;
1797-
1798- if (static_cast <double >(h_bar) * w_bar > max_pixels) {
1799- double beta = std::sqrt ((height * width) / static_cast <double >(max_pixels));
1800- h_bar = std::max (static_cast <int >(factor),
1801- static_cast <int >(std::floor (height / beta / factor)) * static_cast <int >(factor));
1802- w_bar = std::max (static_cast <int >(factor),
1803- static_cast <int >(std::floor (width / beta / factor)) * static_cast <int >(factor));
1804- } else if (static_cast <double >(h_bar) * w_bar < min_pixels) {
1805- double beta = std::sqrt (static_cast <double >(min_pixels) / (height * width));
1806- h_bar = static_cast <int >(std::ceil (height * beta / factor)) * static_cast <int >(factor);
1807- w_bar = static_cast <int >(std::ceil (width * beta / factor)) * static_cast <int >(factor);
1781+ if (sd_version_is_longcat (version)) {
1782+ LOG_INFO (" LongCatEditPipeline" );
1783+ prompt_template_encode_start_idx = 67 ;
1784+ // prompt_template_encode_end_idx = 5;
1785+ int image_embed_idx = 36 + 6 ;
1786+
1787+ int min_pixels = 384 * 384 ;
1788+ int max_pixels = 560 * 560 ;
1789+ std::string placeholder = " <|image_pad|>" ;
1790+ std::string img_prompt;
1791+
1792+
1793+ // Only one image is officially supported by the model; it is unclear how it handles multiple images
1794+ for (int i = 0 ; i < conditioner_params.ref_images .size (); i++) {
1795+ sd_image_f32_t image = sd_image_t_to_sd_image_f32_t (*conditioner_params.ref_images [i]);
1796+ double factor = llm->params .vision .patch_size * llm->params .vision .spatial_merge_size ;
1797+ int height = image.height ;
1798+ int width = image.width ;
1799+ int h_bar = static_cast <int >(std::round (height / factor)) * factor;
1800+ int w_bar = static_cast <int >(std::round (width / factor)) * factor;
1801+
1802+ if (static_cast <double >(h_bar) * w_bar > max_pixels) {
1803+ double beta = std::sqrt ((height * width) / static_cast <double >(max_pixels));
1804+ h_bar = std::max (static_cast <int >(factor),
1805+ static_cast <int >(std::floor (height / beta / factor)) * static_cast <int >(factor));
1806+ w_bar = std::max (static_cast <int >(factor),
1807+ static_cast <int >(std::floor (width / beta / factor)) * static_cast <int >(factor));
1808+ } else if (static_cast <double >(h_bar) * w_bar < min_pixels) {
1809+ double beta = std::sqrt (static_cast <double >(min_pixels) / (height * width));
1810+ h_bar = static_cast <int >(std::ceil (height * beta / factor)) * static_cast <int >(factor);
1811+ w_bar = static_cast <int >(std::ceil (width * beta / factor)) * static_cast <int >(factor);
1812+ }
1813+
1814+ LOG_DEBUG (" resize conditioner ref image %d from %dx%d to %dx%d" , i, image.height , image.width , h_bar, w_bar);
1815+
1816+ sd_image_f32_t resized_image = clip_preprocess (image, w_bar, h_bar);
1817+ free (image.data );
1818+ image.data = nullptr ;
1819+
1820+ ggml_tensor* image_tensor = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, resized_image.width , resized_image.height , 3 , 1 );
1821+ sd_image_f32_to_ggml_tensor (resized_image, image_tensor, false );
1822+ free (resized_image.data );
1823+ resized_image.data = nullptr ;
1824+
1825+ ggml_tensor* image_embed = nullptr ;
1826+ llm->encode_image (n_threads, image_tensor, &image_embed, work_ctx);
1827+ image_embeds.emplace_back (image_embed_idx, image_embed);
1828+ image_embed_idx += 1 + image_embed->ne [1 ] + 6 ;
1829+
1830+ img_prompt += " <|vision_start|>" ;
1831+ int64_t num_image_tokens = image_embed->ne [1 ];
1832+ img_prompt.reserve (num_image_tokens * placeholder.size ());
1833+ for (int j = 0 ; j < num_image_tokens; j++) {
1834+ img_prompt += placeholder;
1835+ }
1836+ img_prompt += " <|vision_end|>" ;
18081837 }
18091838
1810- LOG_DEBUG (" resize conditioner ref image %d from %dx%d to %dx%d" , i, image.height , image.width , h_bar, w_bar);
1839+ max_length = 512 ;
1840+ pad = true ;
1841+ spell_quotes = true ;
1842+ prompt = " <|im_start|>system\n As an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n <|im_start|>user\n " ;
1843+ prompt += img_prompt;
1844+
1845+ prompt_attn_range.first = static_cast <int >(prompt.size ());
1846+ prompt += conditioner_params.text ;
1847+ prompt_attn_range.second = static_cast <int >(prompt.size ());
1848+
1849+ prompt += " <|im_end|>\n <|im_start|>assistant\n " ;
1850+
1851+ } else {
1852+ LOG_INFO (" QwenImageEditPlusPipeline" );
1853+ prompt_template_encode_start_idx = 64 ;
1854+ int image_embed_idx = 64 + 6 ;
1855+
1856+ int min_pixels = 384 * 384 ;
1857+ int max_pixels = 560 * 560 ;
1858+ std::string placeholder = " <|image_pad|>" ;
1859+ std::string img_prompt;
1860+
1861+ for (int i = 0 ; i < conditioner_params.ref_images .size (); i++) {
1862+ sd_image_f32_t image = sd_image_t_to_sd_image_f32_t (*conditioner_params.ref_images [i]);
1863+ double factor = llm->params .vision .patch_size * llm->params .vision .spatial_merge_size ;
1864+ int height = image.height ;
1865+ int width = image.width ;
1866+ int h_bar = static_cast <int >(std::round (height / factor)) * factor;
1867+ int w_bar = static_cast <int >(std::round (width / factor)) * factor;
1868+
1869+ if (static_cast <double >(h_bar) * w_bar > max_pixels) {
1870+ double beta = std::sqrt ((height * width) / static_cast <double >(max_pixels));
1871+ h_bar = std::max (static_cast <int >(factor),
1872+ static_cast <int >(std::floor (height / beta / factor)) * static_cast <int >(factor));
1873+ w_bar = std::max (static_cast <int >(factor),
1874+ static_cast <int >(std::floor (width / beta / factor)) * static_cast <int >(factor));
1875+ } else if (static_cast <double >(h_bar) * w_bar < min_pixels) {
1876+ double beta = std::sqrt (static_cast <double >(min_pixels) / (height * width));
1877+ h_bar = static_cast <int >(std::ceil (height * beta / factor)) * static_cast <int >(factor);
1878+ w_bar = static_cast <int >(std::ceil (width * beta / factor)) * static_cast <int >(factor);
1879+ }
1880+
1881+ LOG_DEBUG (" resize conditioner ref image %d from %dx%d to %dx%d" , i, image.height , image.width , h_bar, w_bar);
18111882
1812- sd_image_f32_t resized_image = clip_preprocess (image, w_bar, h_bar);
1813- free (image.data );
1814- image.data = nullptr ;
1883+ sd_image_f32_t resized_image = clip_preprocess (image, w_bar, h_bar);
1884+ free (image.data );
1885+ image.data = nullptr ;
18151886
1816- ggml_tensor* image_tensor = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, resized_image.width , resized_image.height , 3 , 1 );
1817- sd_image_f32_to_ggml_tensor (resized_image, image_tensor, false );
1818- free (resized_image.data );
1819- resized_image.data = nullptr ;
1887+ ggml_tensor* image_tensor = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, resized_image.width , resized_image.height , 3 , 1 );
1888+ sd_image_f32_to_ggml_tensor (resized_image, image_tensor, false );
1889+ free (resized_image.data );
1890+ resized_image.data = nullptr ;
18201891
1821- ggml_tensor* image_embed = nullptr ;
1822- llm->encode_image (n_threads, image_tensor, &image_embed, work_ctx);
1823- image_embeds.emplace_back (image_embed_idx, image_embed);
1824- image_embed_idx += 1 + image_embed->ne [1 ] + 6 ;
1892+ ggml_tensor* image_embed = nullptr ;
1893+ llm->encode_image (n_threads, image_tensor, &image_embed, work_ctx);
1894+ image_embeds.emplace_back (image_embed_idx, image_embed);
1895+ image_embed_idx += 1 + image_embed->ne [1 ] + 6 ;
18251896
1826- img_prompt += " Picture " + std::to_string (i + 1 ) + " : <|vision_start|>" ; // [24669, 220, index, 25, 220, 151652]
1827- int64_t num_image_tokens = image_embed->ne [1 ];
1828- img_prompt.reserve (num_image_tokens * placeholder.size ());
1829- for (int j = 0 ; j < num_image_tokens; j++) {
1830- img_prompt += placeholder;
1897+ img_prompt += " Picture " + std::to_string (i + 1 ) + " : <|vision_start|>" ; // [24669, 220, index, 25, 220, 151652]
1898+ int64_t num_image_tokens = image_embed->ne [1 ];
1899+ img_prompt.reserve (num_image_tokens * placeholder.size ());
1900+ for (int j = 0 ; j < num_image_tokens; j++) {
1901+ img_prompt += placeholder;
1902+ }
1903+ img_prompt += " <|vision_end|>" ;
18311904 }
1832- img_prompt += " <|vision_end|>" ;
1833- }
18341905
1835- prompt = " <|im_start|>system\n Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n <|im_start|>user\n " ;
1836- prompt += img_prompt;
1906+ prompt = " <|im_start|>system\n Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n <|im_start|>user\n " ;
1907+ prompt += img_prompt;
18371908
1838- prompt_attn_range.first = static_cast <int >(prompt.size ());
1839- prompt += conditioner_params.text ;
1840- prompt_attn_range.second = static_cast <int >(prompt.size ());
1909+ prompt_attn_range.first = static_cast <int >(prompt.size ());
1910+ prompt += conditioner_params.text ;
1911+ prompt_attn_range.second = static_cast <int >(prompt.size ());
18411912
1842- prompt += " <|im_end|>\n <|im_start|>assistant\n " ;
1913+ prompt += " <|im_end|>\n <|im_start|>assistant\n " ;
1914+ }
18431915 } else if (sd_version_is_flux2 (version)) {
18441916 prompt_template_encode_start_idx = 0 ;
18451917 out_layers = {10 , 20 , 30 };
0 commit comments