From b54bbd5ee5b1c4950126c7121a356c35c24c0080 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:08:23 +0800 Subject: [PATCH 1/5] Refactor loader.py to add new architecture support --- loader.py | 218 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 19 deletions(-) diff --git a/loader.py b/loader.py index fd35e13..5b0466a 100644 --- a/loader.py +++ b/loader.py @@ -10,7 +10,7 @@ from .dequant import is_quantized, dequantize_tensor IMG_ARCH_LIST = {"flux", "sd1", "sdxl", "sd3", "aura", "hidream", "cosmos", "ltxv", "hyvid", "wan", "lumina2", "qwen_image"} -TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl"} +TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl", "gemma2"} VIS_TYPE_LIST = {"clip-vision"} def get_orig_shape(reader, tensor_name): @@ -170,6 +170,28 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal "output.weight": "lm_head.weight", } +GEMMA2_SD_MAP = { + "blk.": "model.layers.", + # Attention + ".attn_q.weight": ".self_attn.q_proj.weight", + ".attn_k.weight": ".self_attn.k_proj.weight", + ".attn_v.weight": ".self_attn.v_proj.weight", + ".attn_output.weight": ".self_attn.o_proj.weight", + # LayerNorm + ".attn_norm.weight": ".input_layernorm.weight", + ".post_attention_norm.weight": ".post_attention_layernorm.weight", + ".post_ffw_norm.weight": ".post_feedforward_layernorm.weight", + ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors 只有 pre_feedforward_layernorm + # MLP + ".ffn_up.weight": ".mlp.up_proj.weight", + ".ffn_down.weight": ".mlp.down_proj.weight", + ".ffn_gate.weight": ".mlp.gate_proj.weight", + # emb/out + "token_embd.weight": "model.embed_tokens.weight", + "output_norm.weight": "model.norm.weight", + "output.weight": "lm_head.weight", +} + CLIP_VISION_SD_MAP = { "mm.": "visual.merger.mlp.", "v.post_ln.": "visual.merger.ln_q.", @@ -186,8 +208,10 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal def sd_map_replace(raw_sd, key_map): sd = {} for k,v in raw_sd.items(): + orig_k = k for s,d in key_map.items(): - k = k.replace(s,d) + if s in k: + k = k.replace(s,d) sd[k] = v return sd @@ -278,49 +302,176 @@ def gguf_mmproj_loader(path): def gguf_tokenizer_loader(path, temb_shape): # convert gguf tokenizer to spiece - logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") try: from sentencepiece import sentencepiece_model_pb2 as model except ImportError: raise ImportError("Please make sure sentencepiece and protobuf are installed.\npip install sentencepiece protobuf") - spm = model.ModelProto() - + reader = gguf.GGUFReader(path) + + proto_tensor = None + try: + for tensor in reader.tensors: + if tensor.name == "tokenizer.ggml.spiece_model_raw": + proto_tensor = torch.from_numpy(tensor.data) + break + except Exception as e: + logging.warning(f"Failed to read tokenizer.ggml.spiece_model_raw tensor: {e}") + proto_tensor = None + if proto_tensor is not None: + try: + proto_bytes = proto_tensor.cpu().numpy().tobytes() + spm = model.ModelProto() + spm.ParseFromString(proto_bytes) + vocab_size = len(spm.pieces) + logging.info(f"✓ Loaded complete sentencepiece proto from GGUF tensor: {vocab_size} pieces, {len(proto_bytes)} bytes") + logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " + f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") + if temb_shape[0] != vocab_size: + logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] 
({temb_shape[0]})") + del reader + return torch.ByteTensor(list(proto_bytes)) + except Exception as e: + logging.warning(f"Failed to parse proto from int8 tensor: {e}") + # 继续 fallback + # fallback: 兼容旧字符串字段 + raw_proto_field = get_field(reader, "tokenizer.ggml.spiece_model_raw", str) + if raw_proto_field is not None: + try: + proto_bytes = raw_proto_field.encode('latin1') + spm = model.ModelProto() + spm.ParseFromString(proto_bytes) + vocab_size = len(spm.pieces) + logging.info(f"✓ Loaded complete sentencepiece proto from GGUF metadata: {vocab_size} pieces, " + f"{len(proto_bytes)} bytes (legacy string field)") + logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " + f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") + if temb_shape[0] != vocab_size: + logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] ({temb_shape[0]})") + del reader + return torch.ByteTensor(list(proto_bytes)) + except Exception as e: + logging.warning(f"Failed to load complete proto from metadata: {e}") + logging.warning("Falling back to reconstructing tokenizer from GGUF fields...") + else: + logging.info("No complete sentencepiece proto found in GGUF metadata") + logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") + + spm = model.ModelProto() - if get_field(reader, "tokenizer.ggml.model", str) == "t5": + tokenizer_model = get_field(reader, "tokenizer.ggml.model", str) + if tokenizer_model == "t5": if temb_shape == (256384, 4096): # probably UMT5 spm.trainer_spec.model_type == 1 # Unigram (do we have a T5 w/ BPE?) else: raise NotImplementedError("Unknown model, can't set tokenizer!") + elif tokenizer_model == "llama": + # Gemma2 uses llama tokenizer model + if temb_shape[0] == 256000: # Gemma2_2B vocab_size + spm.trainer_spec.model_type = 1 # Unigram + else: + raise NotImplementedError("Unknown llama-based model, can't set tokenizer!") else: - raise NotImplementedError("Unknown model, can't set tokenizer!") + raise NotImplementedError(f"Unknown tokenizer model '{tokenizer_model}', can't set tokenizer!") - spm.normalizer_spec.add_dummy_prefix = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) - spm.normalizer_spec.remove_extra_whitespaces = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) + val = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) + spm.normalizer_spec.add_dummy_prefix = val if val is not None else False + val = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) + spm.normalizer_spec.remove_extra_whitespaces = val if val is not None else False tokens = get_list_field(reader, "tokenizer.ggml.tokens", str) scores = get_list_field(reader, "tokenizer.ggml.scores", float) toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int) - for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): - # # These aren't present in the original? 
- # if toktype == 5 and idx >= temb_shape[0]%1000): - # continue + eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) + pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) + unk_id = get_field(reader, "tokenizer.ggml.unknown_token_id", int) + bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) + if unk_id is None: + unk_id = get_field(reader, "tokenizer.ggml.unk_token_id", int) + + tokens = list(tokens) + scores = list(scores) + toktypes = list(toktypes) + unk_idxs = [i for i, t in enumerate(tokens) if t == ""] + if unk_idxs: + unk_id = unk_idxs[0] + tokens[unk_id] = "" + toktypes[unk_id] = 2 + for i in reversed(unk_idxs[1:]): + del tokens[i] + del scores[i] + del toktypes[i] + else: + unk_id = len(tokens) + tokens.append("") + scores.append(-100.0) + toktypes.append(2) + def ensure_control(id_val, name): + if id_val is not None and id_val < len(tokens): + if tokens[id_val] == f"<{name}>" and toktypes[id_val] != 3: + toktypes[id_val] = 3 + pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) + bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) + eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) + ensure_control(pad_id, "pad") + ensure_control(bos_id, "bos") + ensure_control(eos_id, "eos") + + def ensure_token(id_val, type_val, name): + nonlocal tokens, scores, toktypes + if id_val is not None and id_val >= len(tokens): + tokens = list(tokens) + [f"<{name}>"] + scores = list(scores) + [-100.0] + toktypes = list(toktypes) + [type_val] + + ensure_token(pad_id, 0, "pad") + ensure_token(bos_id, 1, "bos") + ensure_token(eos_id, 2, "eos") + for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): piece = spm.SentencePiece() piece.piece = token piece.score = score piece.type = toktype spm.pieces.append(piece) - # unsure if any of these are correct spm.trainer_spec.byte_fallback = True - spm.trainer_spec.vocab_size = len(tokens) # split off unused? 
+ spm.trainer_spec.vocab_size = len(tokens) spm.trainer_spec.max_sentence_length = 4096 - spm.trainer_spec.eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - spm.trainer_spec.pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - - logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}") + if eos_id is not None: + spm.trainer_spec.eos_id = eos_id + if pad_id is not None: + spm.trainer_spec.pad_id = pad_id + if unk_id is not None: + spm.trainer_spec.unk_id = unk_id + if bos_id is not None: + spm.trainer_spec.bos_id = bos_id + + import traceback + try: + vocab_size = len(spm.pieces) + print(f"[GGUF DEBUG] tokenizer vocab_size: {vocab_size}") + print(f"[GGUF DEBUG] token_embd.weight shape: {temb_shape}") + if temb_shape[0] < vocab_size: + print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") + raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") + elif temb_shape[0] > vocab_size: + print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") + raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") + + special_ids = [("unk_id", unk_id), ("pad_id", pad_id), ("bos_id", bos_id), ("eos_id", eos_id)] + for name, tid in special_ids: + print(f"[GGUF DEBUG] {name}: {tid}") + if tid is None or not (0 <= tid < vocab_size): + print(f"[GGUF ERROR] {name}={tid} 不在合法范围 0, {vocab_size}!") + raise RuntimeError(f"{name}={tid} 不在合法范围 0, {vocab_size}!") + except Exception as e: + print("[GGUF DEBUG] 发生异常:", e) + traceback.print_exc() + raise + + logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)} (unk_id={unk_id}, pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id})") del reader return torch.ByteTensor(list(spm.SerializeToString())) @@ -348,6 +499,35 @@ def gguf_clip_loader(path): if arch == "qwen2vl": vsd = gguf_mmproj_loader(path) sd.update(vsd) + elif arch == "gemma2": + temb_key = "token_embd.weight" + # Load tokenizer from GGUF metadata + if temb_key in sd: + try: + spm_tensor = gguf_tokenizer_loader(path, sd[temb_key].shape) + if spm_tensor is not None: + sd["spiece_model"] = spm_tensor + except NotImplementedError as e: + logging.error(f"[Gemma2] Failed to load tokenizer: {e}") + raise + if sd[temb_key].shape[0] >= (64 * 1024): + # Dequantize token embeddings to prevent OOM + logging.warning(f"Dequantizing {temb_key} to prevent runtime OOM.") + sd[temb_key] = dequantize_tensor(sd[temb_key], dtype=torch.float16) + sd = sd_map_replace(sd, GEMMA2_SD_MAP) + # Gemma2_2B has 8 attention heads and 4 key-value heads + sd = llama_permute(sd, 8, 4) + fix_keys = {} + for k in list(sd.keys()): + if k.startswith("model.layers."): + if ( + ("layernorm" in k or "mlp." 
in k or "proj" in k) + and not k.endswith(".weight") + and not k.endswith(".bias") + ): + fix_keys[k+".weight"] = sd[k] + del sd[k] + sd.update(fix_keys) else: pass return sd From 43d43f4bff22c8c03240380a066af054e8a1aef3 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:08:48 +0800 Subject: [PATCH 2/5] Add files via upload --- tools/convert_gemma2.py | 131 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 tools/convert_gemma2.py diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py new file mode 100644 index 0000000..fc15579 --- /dev/null +++ b/tools/convert_gemma2.py @@ -0,0 +1,131 @@ +# (c) City96 || Apache-2.0 +# Gemma2 safetensors -> GGUF 专用转换脚本,保留全部精度和元数据 +import os +import argparse +import logging +from safetensors.torch import load_file +import torch +import gguf +from tqdm import tqdm + +# Gemma2 key映射表(safetensors -> gguf) +KEY_MAP = { + # embedding + "model.embed_tokens.weight": "token_embd.weight", + # norm + "model.norm.weight": "output_norm.weight", + # spiece + "spiece_model": "tokenizer.ggml.spiece_model_raw", +} + +# 层参数映射 +LAYER_KEY_MAP = { + # LayerNorm + "input_layernorm.weight": "attn_norm.weight", + "post_attention_layernorm.weight": "post_attention_norm.weight", + "post_feedforward_layernorm.weight": "post_ffw_norm.weight", + "pre_feedforward_layernorm.weight": "ffn_norm.weight", + # MLP + "mlp.down_proj.weight": "ffn_down.weight", + "mlp.gate_proj.weight": "ffn_gate.weight", + "mlp.up_proj.weight": "ffn_up.weight", + # Attention + "self_attn.k_proj.weight": "attn_k.weight", + "self_attn.o_proj.weight": "attn_output.weight", + "self_attn.q_proj.weight": "attn_q.weight", + "self_attn.v_proj.weight": "attn_v.weight", +} + + +def parse_args(): + parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF (保留全部精度和元数据)") + parser.add_argument("--src", required=True, help="源 safetensors 文件") + parser.add_argument("--dst", help="输出 GGUF 文件") + args = parser.parse_args() + if not os.path.isfile(args.src): + parser.error("输入文件不存在!") + return args + + +def map_key(key): + # 直接映射 + if key in KEY_MAP: + return KEY_MAP[key] + # 层参数映射 + import re + m = re.match(r"model.layers.(\d+)\.(.+)", key) + if m: + layer_idx, subkey = m.groups() + if subkey in LAYER_KEY_MAP: + return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" + return key # 其他直接保留 + + +def main(): + args = parse_args() + state_dict = load_file(args.src) + # 统计主精度 + dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] + main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 + if main_dtype == torch.float32: + ftype_name = "F32" + ftype_gguf = gguf.GGMLQuantizationType.F32 + elif main_dtype == torch.bfloat16: + ftype_name = "BF16" + ftype_gguf = gguf.GGMLQuantizationType.BF16 + else: + ftype_name = "F16" + ftype_gguf = gguf.GGMLQuantizationType.F16 + dst = args.dst or f"{os.path.splitext(args.src)[0]}-{ftype_name}.gguf" + if os.path.isfile(dst): + input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") + writer = gguf.GGUFWriter(path=None, arch="gemma2") + writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + # 处理所有权重 + for key, value in tqdm(state_dict.items()): + new_key = map_key(key) + if key == "spiece_model": + # 转为 int8 存储,保证 gguf 支持 + arr = value.cpu().numpy().astype("int8") + writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) + tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") + continue + if not hasattr(value, 'dtype'): + tqdm.write(f"跳过非张量: {key}") + 
continue + arr = value.cpu().numpy() + # 精度策略:norm 层全部 F32,embedding/attn/mlp 优先 F16 + # norm 层 key 统一处理 + norm_keys = [ + "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", "output_norm.weight" + ] + # embedding key + emb_keys = ["token_embd.weight"] + # 判断是否 norm 层 + is_norm = any(new_key.endswith(nk) for nk in norm_keys) + is_emb = any(new_key == ek for ek in emb_keys) + # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype + if is_norm: + if value.dtype == torch.float32 or value.dtype == torch.bfloat16: + qtype = gguf.GGMLQuantizationType.F32 + elif value.dtype == torch.float16: + qtype = gguf.GGMLQuantizationType.F16 + else: + qtype = gguf.GGMLQuantizationType.F16 + elif is_emb: + qtype = gguf.GGMLQuantizationType.F16 + elif value.dtype == torch.bfloat16: + qtype = gguf.GGMLQuantizationType.BF16 + else: + qtype = gguf.GGMLQuantizationType.F16 + writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") + # 写入文件 + writer.write_header_to_file(path=dst) + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + print(f"转换完成: {dst}") + +if __name__ == "__main__": + main() From e366fd579b4422fc674db36a04aa67e1341c9b20 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:10:07 +0800 Subject: [PATCH 3/5] Update convert_gemma2.py --- tools/convert_gemma2.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py index fc15579..ced9e7f 100644 --- a/tools/convert_gemma2.py +++ b/tools/convert_gemma2.py @@ -1,5 +1,3 @@ -# (c) City96 || Apache-2.0 -# Gemma2 safetensors -> GGUF 专用转换脚本,保留全部精度和元数据 import os import argparse import logging @@ -8,7 +6,6 @@ import gguf from tqdm import tqdm -# Gemma2 key映射表(safetensors -> gguf) KEY_MAP = { # embedding "model.embed_tokens.weight": "token_embd.weight", @@ -18,7 +15,6 @@ "spiece_model": "tokenizer.ggml.spiece_model_raw", } -# 层参数映射 LAYER_KEY_MAP = { # LayerNorm "input_layernorm.weight": "attn_norm.weight", @@ -48,23 +44,20 @@ def parse_args(): def map_key(key): - # 直接映射 if key in KEY_MAP: return KEY_MAP[key] - # 层参数映射 import re m = re.match(r"model.layers.(\d+)\.(.+)", key) if m: layer_idx, subkey = m.groups() if subkey in LAYER_KEY_MAP: return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" - return key # 其他直接保留 + return key def main(): args = parse_args() state_dict = load_file(args.src) - # 统计主精度 dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 if main_dtype == torch.float32: @@ -81,11 +74,9 @@ def main(): input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") writer = gguf.GGUFWriter(path=None, arch="gemma2") writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - # 处理所有权重 for key, value in tqdm(state_dict.items()): new_key = map_key(key) if key == "spiece_model": - # 转为 int8 存储,保证 gguf 支持 arr = value.cpu().numpy().astype("int8") writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") @@ -94,14 +85,12 @@ def main(): tqdm.write(f"跳过非张量: {key}") continue arr = value.cpu().numpy() - # 精度策略:norm 层全部 F32,embedding/attn/mlp 优先 F16 - # norm 层 key 统一处理 + # norm 层全部 F32,embedding/attn/mlp 优先 F16 norm_keys = [ "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", 
"output_norm.weight" ] # embedding key emb_keys = ["token_embd.weight"] - # 判断是否 norm 层 is_norm = any(new_key.endswith(nk) for nk in norm_keys) is_emb = any(new_key == ek for ek in emb_keys) # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype @@ -120,7 +109,6 @@ def main(): qtype = gguf.GGMLQuantizationType.F16 writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") - # 写入文件 writer.write_header_to_file(path=dst) writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) @@ -129,3 +117,4 @@ def main(): if __name__ == "__main__": main() + From 7d233124e570f36523fcd3f183dec87008c1c701 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:58:33 +0800 Subject: [PATCH 4/5] Enhance Gemma2 conversion with quantization support Added quantization options and improved tensor handling. --- tools/convert_gemma2.py | 155 ++++++++++++++++++++++++++++------------ 1 file changed, 111 insertions(+), 44 deletions(-) diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py index ced9e7f..afcd884 100644 --- a/tools/convert_gemma2.py +++ b/tools/convert_gemma2.py @@ -6,6 +6,7 @@ import gguf from tqdm import tqdm +# Gemma2 key mapping KEY_MAP = { # embedding "model.embed_tokens.weight": "token_embd.weight", @@ -15,6 +16,7 @@ "spiece_model": "tokenizer.ggml.spiece_model_raw", } +# Layer parameter mapping LAYER_KEY_MAP = { # LayerNorm "input_layernorm.weight": "attn_norm.weight", @@ -34,87 +36,152 @@ def parse_args(): - parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF (保留全部精度和元数据)") - parser.add_argument("--src", required=True, help="源 safetensors 文件") - parser.add_argument("--dst", help="输出 GGUF 文件") + parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF with precision preservation") + parser.add_argument("--src", required=True, help="Source safetensors file") + parser.add_argument("--dst", help="Output GGUF file") + parser.add_argument("--quantize", "--quant", "-q", + choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q2_k", "q3_k", "q4_k", "q5_k", "q6_k"], + help="Quantization type") args = parser.parse_args() if not os.path.isfile(args.src): - parser.error("输入文件不存在!") + parser.error("Input file does not exist!") return args def map_key(key): + # Direct mapping if key in KEY_MAP: return KEY_MAP[key] + # Layer parameter mapping import re m = re.match(r"model.layers.(\d+)\.(.+)", key) if m: layer_idx, subkey = m.groups() if subkey in LAYER_KEY_MAP: return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" - return key + return key # Keep others as-is + + +def get_quantization_type(quant_str): + quant_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, + "bf16": gguf.GGMLQuantizationType.BF16, + "q8_0": gguf.GGMLQuantizationType.Q8_0, + "q4_0": gguf.GGMLQuantizationType.Q4_0, + "q4_1": gguf.GGMLQuantizationType.Q4_1, + "q5_0": gguf.GGMLQuantizationType.Q5_0, + "q5_1": gguf.GGMLQuantizationType.Q5_1, + "q2_k": gguf.GGMLQuantizationType.Q2_K, + "q3_k": gguf.GGMLQuantizationType.Q3_K, + "q4_k": gguf.GGMLQuantizationType.Q4_K, + "q5_k": gguf.GGMLQuantizationType.Q5_K, + "q6_k": gguf.GGMLQuantizationType.Q6_K, + } + return quant_map.get(quant_str.lower()) + + +def should_quantize_tensor(key, quant_type): + """Determine if a tensor should be quantized + Rules: + - token_embd (embedding) kept at F16 (quantization severely impacts quality) + - norm layers kept at F32 
(quantization affects stability) + - other weights (attn/mlp) use target quantization + """ + # Embedding always kept at F16 + if key == "token_embd.weight": + return False, gguf.GGMLQuantizationType.F16 + + # Norm layers kept at F32 + norm_suffixes = [ + "attn_norm.weight", + "post_attention_norm.weight", + "post_ffw_norm.weight", + "ffn_norm.weight", + "output_norm.weight" + ] + if any(key.endswith(suffix) for suffix in norm_suffixes): + return False, gguf.GGMLQuantizationType.F32 + + # Other layers (attn/mlp) use target quantization + return True, quant_type def main(): args = parse_args() state_dict = load_file(args.src) - dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] - main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 - if main_dtype == torch.float32: - ftype_name = "F32" - ftype_gguf = gguf.GGMLQuantizationType.F32 - elif main_dtype == torch.bfloat16: - ftype_name = "BF16" - ftype_gguf = gguf.GGMLQuantizationType.BF16 + + if args.quantize: + quant_type = get_quantization_type(args.quantize) + ftype_name = args.quantize.upper() else: - ftype_name = "F16" - ftype_gguf = gguf.GGMLQuantizationType.F16 + dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] + main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 + if main_dtype == torch.float32: + ftype_name = "F32" + quant_type = gguf.GGMLQuantizationType.F32 + elif main_dtype == torch.bfloat16: + ftype_name = "BF16" + quant_type = gguf.GGMLQuantizationType.BF16 + else: + ftype_name = "F16" + quant_type = gguf.GGMLQuantizationType.F16 + dst = args.dst or f"{os.path.splitext(args.src)[0]}-{ftype_name}.gguf" if os.path.isfile(dst): - input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") + input(f"Output file {dst} exists, press Enter to overwrite or Ctrl+C to cancel...") + writer = gguf.GGUFWriter(path=None, arch="gemma2") writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - for key, value in tqdm(state_dict.items()): + + print(f"Target quantization: {ftype_name}") + print(f"Output file: {dst}") + + for key, value in tqdm(state_dict.items(), desc="Converting"): new_key = map_key(key) + + # Special handling for spiece_model if key == "spiece_model": arr = value.cpu().numpy().astype("int8") writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) - tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") + tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape[0]} bytes, I8)") continue + if not hasattr(value, 'dtype'): - tqdm.write(f"跳过非张量: {key}") + tqdm.write(f"Skipping non-tensor: {key}") continue + arr = value.cpu().numpy() - # norm 层全部 F32,embedding/attn/mlp 优先 F16 - norm_keys = [ - "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", "output_norm.weight" - ] - # embedding key - emb_keys = ["token_embd.weight"] - is_norm = any(new_key.endswith(nk) for nk in norm_keys) - is_emb = any(new_key == ek for ek in emb_keys) - # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype - if is_norm: - if value.dtype == torch.float32 or value.dtype == torch.bfloat16: - qtype = gguf.GGMLQuantizationType.F32 - elif value.dtype == torch.float16: - qtype = gguf.GGMLQuantizationType.F16 - else: - qtype = gguf.GGMLQuantizationType.F16 - elif is_emb: - qtype = gguf.GGMLQuantizationType.F16 - elif value.dtype == torch.bfloat16: - qtype = gguf.GGMLQuantizationType.BF16 + + # Determine if quantization needed + get target precision + should_quant, target_qtype = should_quantize_tensor(new_key, 
quant_type) + + # Apply quantization or keep original precision + if should_quant and target_qtype not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16]: + quantized_arr = gguf.quants.quantize(arr, target_qtype) + writer.add_tensor(new_key, quantized_arr, raw_dtype=target_qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {target_qtype.name}, shape={arr.shape}") else: - qtype = gguf.GGMLQuantizationType.F16 - writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) - tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") + if target_qtype == gguf.GGMLQuantizationType.F32: + arr = arr.astype('float32') + elif target_qtype == gguf.GGMLQuantizationType.BF16: + # BF16 requires special handling + pass # gguf.quants.quantize handles this + else: # F16 + arr = arr.astype('float16') + + quantized_arr = gguf.quants.quantize(arr, target_qtype) + writer.add_tensor(new_key, quantized_arr, raw_dtype=target_qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {target_qtype.name}, shape={arr.shape}") + + print("Writing GGUF file...") writer.write_header_to_file(path=dst) writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) writer.close() - print(f"转换完成: {dst}") + print(f"Conversion complete: {dst}") + print(f"Quantization type: {ftype_name}") + print(f"File size: {os.path.getsize(dst) / (1024**3):.2f} GB") if __name__ == "__main__": main() - From cc07b1ae92330b03e8216b5dc7d3a92db8564379 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 22:11:23 +0800 Subject: [PATCH 5/5] Update loader.py --- loader.py | 148 ++++-------------------------------------------------- 1 file changed, 9 insertions(+), 139 deletions(-) diff --git a/loader.py b/loader.py index 5b0466a..684655b 100644 --- a/loader.py +++ b/loader.py @@ -181,7 +181,7 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal ".attn_norm.weight": ".input_layernorm.weight", ".post_attention_norm.weight": ".post_attention_layernorm.weight", ".post_ffw_norm.weight": ".post_feedforward_layernorm.weight", - ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors 只有 pre_feedforward_layernorm + ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors only has pre_feedforward_layernorm # MLP ".ffn_up.weight": ".mlp.up_proj.weight", ".ffn_down.weight": ".mlp.down_proj.weight", @@ -333,147 +333,17 @@ def gguf_tokenizer_loader(path, temb_shape): return torch.ByteTensor(list(proto_bytes)) except Exception as e: logging.warning(f"Failed to parse proto from int8 tensor: {e}") - # 继续 fallback - # fallback: 兼容旧字符串字段 + spiece_tensor = reader.get_tensor("tokenizer.ggml.spiece_model_raw") + if spiece_tensor is not None: + del reader + return spiece_tensor raw_proto_field = get_field(reader, "tokenizer.ggml.spiece_model_raw", str) if raw_proto_field is not None: - try: - proto_bytes = raw_proto_field.encode('latin1') - spm = model.ModelProto() - spm.ParseFromString(proto_bytes) - vocab_size = len(spm.pieces) - logging.info(f"✓ Loaded complete sentencepiece proto from GGUF metadata: {vocab_size} pieces, " - f"{len(proto_bytes)} bytes (legacy string field)") - logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " - f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") - if temb_shape[0] != vocab_size: - logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] ({temb_shape[0]})") - del reader - return 
torch.ByteTensor(list(proto_bytes)) - except Exception as e: - logging.warning(f"Failed to load complete proto from metadata: {e}") - logging.warning("Falling back to reconstructing tokenizer from GGUF fields...") - else: - logging.info("No complete sentencepiece proto found in GGUF metadata") - logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") - - spm = model.ModelProto() - - tokenizer_model = get_field(reader, "tokenizer.ggml.model", str) - if tokenizer_model == "t5": - if temb_shape == (256384, 4096): # probably UMT5 - spm.trainer_spec.model_type == 1 # Unigram (do we have a T5 w/ BPE?) - else: - raise NotImplementedError("Unknown model, can't set tokenizer!") - elif tokenizer_model == "llama": - # Gemma2 uses llama tokenizer model - if temb_shape[0] == 256000: # Gemma2_2B vocab_size - spm.trainer_spec.model_type = 1 # Unigram - else: - raise NotImplementedError("Unknown llama-based model, can't set tokenizer!") - else: - raise NotImplementedError(f"Unknown tokenizer model '{tokenizer_model}', can't set tokenizer!") - - val = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) - spm.normalizer_spec.add_dummy_prefix = val if val is not None else False - val = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) - spm.normalizer_spec.remove_extra_whitespaces = val if val is not None else False - - tokens = get_list_field(reader, "tokenizer.ggml.tokens", str) - scores = get_list_field(reader, "tokenizer.ggml.scores", float) - toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int) - - eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - unk_id = get_field(reader, "tokenizer.ggml.unknown_token_id", int) - bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) - if unk_id is None: - unk_id = get_field(reader, "tokenizer.ggml.unk_token_id", int) - - tokens = list(tokens) - scores = list(scores) - toktypes = list(toktypes) - unk_idxs = [i for i, t in enumerate(tokens) if t == ""] - if unk_idxs: - unk_id = unk_idxs[0] - tokens[unk_id] = "" - toktypes[unk_id] = 2 - for i in reversed(unk_idxs[1:]): - del tokens[i] - del scores[i] - del toktypes[i] - else: - unk_id = len(tokens) - tokens.append("") - scores.append(-100.0) - toktypes.append(2) - def ensure_control(id_val, name): - if id_val is not None and id_val < len(tokens): - if tokens[id_val] == f"<{name}>" and toktypes[id_val] != 3: - toktypes[id_val] = 3 - pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) - eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - ensure_control(pad_id, "pad") - ensure_control(bos_id, "bos") - ensure_control(eos_id, "eos") - - def ensure_token(id_val, type_val, name): - nonlocal tokens, scores, toktypes - if id_val is not None and id_val >= len(tokens): - tokens = list(tokens) + [f"<{name}>"] - scores = list(scores) + [-100.0] - toktypes = list(toktypes) + [type_val] - - ensure_token(pad_id, 0, "pad") - ensure_token(bos_id, 1, "bos") - ensure_token(eos_id, 2, "eos") - - for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): - piece = spm.SentencePiece() - piece.piece = token - piece.score = score - piece.type = toktype - spm.pieces.append(piece) - - spm.trainer_spec.byte_fallback = True - spm.trainer_spec.vocab_size = len(tokens) - spm.trainer_spec.max_sentence_length = 4096 - if eos_id is not None: - spm.trainer_spec.eos_id = eos_id - if 
pad_id is not None: - spm.trainer_spec.pad_id = pad_id - if unk_id is not None: - spm.trainer_spec.unk_id = unk_id - if bos_id is not None: - spm.trainer_spec.bos_id = bos_id - - import traceback - try: - vocab_size = len(spm.pieces) - print(f"[GGUF DEBUG] tokenizer vocab_size: {vocab_size}") - print(f"[GGUF DEBUG] token_embd.weight shape: {temb_shape}") - if temb_shape[0] < vocab_size: - print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") - raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") - elif temb_shape[0] > vocab_size: - print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") - raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") - - special_ids = [("unk_id", unk_id), ("pad_id", pad_id), ("bos_id", bos_id), ("eos_id", eos_id)] - for name, tid in special_ids: - print(f"[GGUF DEBUG] {name}: {tid}") - if tid is None or not (0 <= tid < vocab_size): - print(f"[GGUF ERROR] {name}={tid} 不在合法范围 0, {vocab_size}!") - raise RuntimeError(f"{name}={tid} 不在合法范围 0, {vocab_size}!") - except Exception as e: - print("[GGUF DEBUG] 发生异常:", e) - traceback.print_exc() - raise - - logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)} (unk_id={unk_id}, pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id})") + proto_bytes = raw_proto_field.encode('latin1') + del reader + return torch.ByteTensor(list(proto_bytes)) del reader - return torch.ByteTensor(list(spm.SerializeToString())) + raise NotImplementedError("No sentencepiece proto found in GGUF metadata!") def gguf_clip_loader(path): sd, arch = gguf_sd_loader(path, return_arch=True, is_text_model=True)
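
The converter and the loader in this series define mirror-image key tables: tools/convert_gemma2.py renames Hugging Face/safetensors keys to GGUF names at conversion time (map_key with KEY_MAP/LAYER_KEY_MAP), and GEMMA2_SD_MAP in loader.py maps them back at load time via sd_map_replace's substring replacement. A minimal sketch of that round trip, restating only a few entries from the maps in this patch (abbreviated, not the full tables):

import re

# Subset of LAYER_KEY_MAP from tools/convert_gemma2.py (safetensors -> GGUF)
LAYER_KEY_MAP = {
    "self_attn.q_proj.weight": "attn_q.weight",
    "input_layernorm.weight": "attn_norm.weight",
    "mlp.up_proj.weight": "ffn_up.weight",
}

# Subset of GEMMA2_SD_MAP from loader.py (GGUF -> safetensors),
# applied as plain substring replacement like sd_map_replace()
GEMMA2_SD_MAP = {
    "blk.": "model.layers.",
    ".attn_q.weight": ".self_attn.q_proj.weight",
    ".attn_norm.weight": ".input_layernorm.weight",
    ".ffn_up.weight": ".mlp.up_proj.weight",
}

def to_gguf(key):
    # mirrors map_key(): per-layer keys become blk.<n>.<gguf name>
    m = re.match(r"model\.layers\.(\d+)\.(.+)", key)
    if m and m.group(2) in LAYER_KEY_MAP:
        return f"blk.{m.group(1)}.{LAYER_KEY_MAP[m.group(2)]}"
    return key

def to_safetensors(key):
    # mirrors sd_map_replace(): replace every matching substring from the map
    for s, d in GEMMA2_SD_MAP.items():
        if s in key:
            key = key.replace(s, d)
    return key

k = "model.layers.0.self_attn.q_proj.weight"
g = to_gguf(k)                   # "blk.0.attn_q.weight"
assert to_safetensors(g) == k    # round-trips back to the safetensors name

Note that in the actual loader the q/k projections additionally go through llama_permute(sd, 8, 4) after the rename; the sketch only covers the naming side.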
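
The converter also expects a "spiece_model" entry in the source state dict (KEY_MAP maps it to tokenizer.ggml.spiece_model_raw) and stores the serialized sentencepiece proto as a raw I8 tensor; after PATCH 5, gguf_tokenizer_loader simply reads that tensor back as bytes instead of reconstructing the tokenizer field by field. The uint8 -> int8 cast used for storage is lossless because both views share the same underlying bytes; a small self-contained check of that assumption (no GGUF file involved, the byte string below is an arbitrary stand-in):

import numpy as np

proto_bytes = bytes(range(256))  # stand-in for a serialized sentencepiece proto
as_i8 = np.frombuffer(proto_bytes, dtype=np.uint8).astype(np.int8)  # what the converter writes as an I8 tensor
restored = as_i8.tobytes()       # what the loader recovers from tensor.data
assert restored == proto_bytes   # two's-complement reinterpretation is byte-exact

For the conversion itself, the script from PATCH 4 would be run as something like python tools/convert_gemma2.py --src <gemma2>.safetensors -q q8_0, where the file names are placeholders; the --quantize choices, and the rule in should_quantize_tensor that norm weights stay F32 and token_embd.weight stays F16 while attention/MLP weights take the requested quantization, come straight from the patch.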