@@ -1678,6 +1678,19 @@ struct LLMEmbedder : public Conditioner {
16781678 }
16791679 }
16801680
1681+ size_t get_utf8_char_len (char c) {
1682+ unsigned char uc = static_cast <unsigned char >(c);
1683+ if ((uc & 0x80 ) == 0 )
1684+ return 1 ; // ASCII (1 byte)
1685+ if ((uc & 0xE0 ) == 0xC0 )
1686+ return 2 ; // 2-byte char
1687+ if ((uc & 0xF0 ) == 0xE0 )
1688+ return 3 ; // 3-byte char (Common for Chinese/Japanese)
1689+ if ((uc & 0xF8 ) == 0xF0 )
1690+ return 4 ; // 4-byte char (Emojis, etc.)
1691+ return 1 ; // Fallback (should not happen in valid UTF-8)
1692+ }
1693+
16811694 std::tuple<std::vector<int >, std::vector<float >> tokenize (
16821695 std::string text,
16831696 std::pair<int , int > attn_range,
@@ -1697,16 +1710,6 @@ struct LLMEmbedder : public Conditioner {
16971710 }
16981711 parsed_attention.emplace_back (text.substr (attn_range.second ), 1 .f );
16991712
1700- // {
1701- // std::stringstream ss;
1702- // ss << '[';
1703- // for (const auto& item : parsed_attention) {
1704- // ss << "['" << item.first << "', " << item.second << "], ";
1705- // }
1706- // ss << ']';
1707- // LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
1708- // }
1709-
17101713 std::vector<int > tokens;
17111714 std::vector<float > weights;
17121715
@@ -1715,46 +1718,47 @@ struct LLMEmbedder : public Conditioner {
17151718 float curr_weight = item.second ;
17161719
17171720 if (spell_quotes) {
1718- std::vector<std:: string> parts ;
1721+ std::string buffer ;
17191722 bool in_quote = false ;
1720- std::string current_part;
17211723
1722- for (char c : curr_text) {
1723- if (c == ' "' ) {
1724- if (!current_part.empty ()) {
1725- parts.push_back (current_part);
1726- current_part.clear ();
1724+ size_t i = 0 ;
1725+ while (i < curr_text.size ()) {
1726+ // utf8 character can be 1-4 char
1727+ size_t char_len = get_utf8_char_len (curr_text[i]);
1728+
1729+ // Safety check to prevent reading past end of string
1730+ if (i + char_len > curr_text.size ()) {
1731+ char_len = curr_text.size () - i;
1732+ }
1733+ std::string uchar = curr_text.substr (i, char_len);
1734+ i += char_len;
1735+
1736+ if (uchar == " \" " ) {
1737+ buffer += uchar;
1738+ // If we were accumulating normal text, flush it now
1739+ if (!in_quote) {
1740+ std::vector<int > part_tokens = tokenizer->tokenize (buffer, nullptr );
1741+ tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1742+ weights.insert (weights.end (), part_tokens.size (), curr_weight);
1743+ buffer.clear ();
17271744 }
17281745 in_quote = !in_quote;
17291746 } else {
1730- current_part += c;
1731- if (in_quote && current_part.size () == 1 ) {
1732- parts.push_back (current_part);
1733- current_part.clear ();
1734- }
1735- }
1736- }
1737- if (!current_part.empty ()) {
1738- parts.push_back (current_part);
1739- }
1740-
1741- for (const auto & part : parts) {
1742- if (part.empty ())
1743- continue ;
1744- if (part[0 ] == ' "' && part.back () == ' "' ) {
1745- std::string quoted_content = part.substr (1 , part.size () - 2 );
1746- for (char ch : quoted_content) {
1747- std::string char_str (1 , ch);
1748- std::vector<int > char_tokens = tokenizer->tokenize (char_str, nullptr );
1747+ if (in_quote) {
1748+ std::vector<int > char_tokens = tokenizer->tokenize (uchar, nullptr );
17491749 tokens.insert (tokens.end (), char_tokens.begin (), char_tokens.end ());
17501750 weights.insert (weights.end (), char_tokens.size (), curr_weight);
1751+ } else {
1752+ buffer += uchar;
17511753 }
1752- } else {
1753- std::vector<int > part_tokens = tokenizer->tokenize (part, nullptr );
1754- tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1755- weights.insert (weights.end (), part_tokens.size (), curr_weight);
17561754 }
17571755 }
1756+
1757+ if (!buffer.empty ()) {
1758+ std::vector<int > part_tokens = tokenizer->tokenize (buffer, nullptr );
1759+ tokens.insert (tokens.end (), part_tokens.begin (), part_tokens.end ());
1760+ weights.insert (weights.end (), part_tokens.size (), curr_weight);
1761+ }
17581762 } else {
17591763 std::vector<int > curr_tokens = tokenizer->tokenize (curr_text, nullptr );
17601764 tokens.insert (tokens.end (), curr_tokens.begin (), curr_tokens.end ());
@@ -1782,14 +1786,13 @@ struct LLMEmbedder : public Conditioner {
17821786 LOG_INFO (" LongCatEditPipeline" );
17831787 prompt_template_encode_start_idx = 67 ;
17841788 // prompt_template_encode_end_idx = 5;
1785- int image_embed_idx = 36 + 6 ;
1789+ int image_embed_idx = 36 + 6 ;
17861790
17871791 int min_pixels = 384 * 384 ;
17881792 int max_pixels = 560 * 560 ;
17891793 std::string placeholder = " <|image_pad|>" ;
17901794 std::string img_prompt;
17911795
1792-
17931796 // Only one image is officially supported by the model, not sure how it handles multiple images
17941797 for (int i = 0 ; i < conditioner_params.ref_images .size (); i++) {
17951798 sd_image_f32_t image = sd_image_t_to_sd_image_f32_t (*conditioner_params.ref_images [i]);
0 commit comments