From b54bbd5ee5b1c4950126c7121a356c35c24c0080 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:08:23 +0800 Subject: [PATCH 1/5] Refactor loader.py to add new architecture support --- loader.py | 218 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 19 deletions(-) diff --git a/loader.py b/loader.py index fd35e13..5b0466a 100644 --- a/loader.py +++ b/loader.py @@ -10,7 +10,7 @@ from .dequant import is_quantized, dequantize_tensor IMG_ARCH_LIST = {"flux", "sd1", "sdxl", "sd3", "aura", "hidream", "cosmos", "ltxv", "hyvid", "wan", "lumina2", "qwen_image"} -TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl"} +TXT_ARCH_LIST = {"t5", "t5encoder", "llama", "qwen2vl", "gemma2"} VIS_TYPE_LIST = {"clip-vision"} def get_orig_shape(reader, tensor_name): @@ -170,6 +170,28 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal "output.weight": "lm_head.weight", } +GEMMA2_SD_MAP = { + "blk.": "model.layers.", + # Attention + ".attn_q.weight": ".self_attn.q_proj.weight", + ".attn_k.weight": ".self_attn.k_proj.weight", + ".attn_v.weight": ".self_attn.v_proj.weight", + ".attn_output.weight": ".self_attn.o_proj.weight", + # LayerNorm + ".attn_norm.weight": ".input_layernorm.weight", + ".post_attention_norm.weight": ".post_attention_layernorm.weight", + ".post_ffw_norm.weight": ".post_feedforward_layernorm.weight", + ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors 只有 pre_feedforward_layernorm + # MLP + ".ffn_up.weight": ".mlp.up_proj.weight", + ".ffn_down.weight": ".mlp.down_proj.weight", + ".ffn_gate.weight": ".mlp.gate_proj.weight", + # emb/out + "token_embd.weight": "model.embed_tokens.weight", + "output_norm.weight": "model.norm.weight", + "output.weight": "lm_head.weight", +} + CLIP_VISION_SD_MAP = { "mm.": "visual.merger.mlp.", "v.post_ln.": "visual.merger.ln_q.", @@ -186,8 +208,10 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal def sd_map_replace(raw_sd, key_map): sd = {} for k,v in raw_sd.items(): + orig_k = k for s,d in key_map.items(): - k = k.replace(s,d) + if s in k: + k = k.replace(s,d) sd[k] = v return sd @@ -278,49 +302,176 @@ def gguf_mmproj_loader(path): def gguf_tokenizer_loader(path, temb_shape): # convert gguf tokenizer to spiece - logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") try: from sentencepiece import sentencepiece_model_pb2 as model except ImportError: raise ImportError("Please make sure sentencepiece and protobuf are installed.\npip install sentencepiece protobuf") - spm = model.ModelProto() - + reader = gguf.GGUFReader(path) + + proto_tensor = None + try: + for tensor in reader.tensors: + if tensor.name == "tokenizer.ggml.spiece_model_raw": + proto_tensor = torch.from_numpy(tensor.data) + break + except Exception as e: + logging.warning(f"Failed to read tokenizer.ggml.spiece_model_raw tensor: {e}") + proto_tensor = None + if proto_tensor is not None: + try: + proto_bytes = proto_tensor.cpu().numpy().tobytes() + spm = model.ModelProto() + spm.ParseFromString(proto_bytes) + vocab_size = len(spm.pieces) + logging.info(f"✓ Loaded complete sentencepiece proto from GGUF tensor: {vocab_size} pieces, {len(proto_bytes)} bytes") + logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " + f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") + if temb_shape[0] != vocab_size: + logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] 
({temb_shape[0]})") + del reader + return torch.ByteTensor(list(proto_bytes)) + except Exception as e: + logging.warning(f"Failed to parse proto from int8 tensor: {e}") + # 继续 fallback + # fallback: 兼容旧字符串字段 + raw_proto_field = get_field(reader, "tokenizer.ggml.spiece_model_raw", str) + if raw_proto_field is not None: + try: + proto_bytes = raw_proto_field.encode('latin1') + spm = model.ModelProto() + spm.ParseFromString(proto_bytes) + vocab_size = len(spm.pieces) + logging.info(f"✓ Loaded complete sentencepiece proto from GGUF metadata: {vocab_size} pieces, " + f"{len(proto_bytes)} bytes (legacy string field)") + logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " + f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") + if temb_shape[0] != vocab_size: + logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] ({temb_shape[0]})") + del reader + return torch.ByteTensor(list(proto_bytes)) + except Exception as e: + logging.warning(f"Failed to load complete proto from metadata: {e}") + logging.warning("Falling back to reconstructing tokenizer from GGUF fields...") + else: + logging.info("No complete sentencepiece proto found in GGUF metadata") + logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") + + spm = model.ModelProto() - if get_field(reader, "tokenizer.ggml.model", str) == "t5": + tokenizer_model = get_field(reader, "tokenizer.ggml.model", str) + if tokenizer_model == "t5": if temb_shape == (256384, 4096): # probably UMT5 spm.trainer_spec.model_type == 1 # Unigram (do we have a T5 w/ BPE?) else: raise NotImplementedError("Unknown model, can't set tokenizer!") + elif tokenizer_model == "llama": + # Gemma2 uses llama tokenizer model + if temb_shape[0] == 256000: # Gemma2_2B vocab_size + spm.trainer_spec.model_type = 1 # Unigram + else: + raise NotImplementedError("Unknown llama-based model, can't set tokenizer!") else: - raise NotImplementedError("Unknown model, can't set tokenizer!") + raise NotImplementedError(f"Unknown tokenizer model '{tokenizer_model}', can't set tokenizer!") - spm.normalizer_spec.add_dummy_prefix = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) - spm.normalizer_spec.remove_extra_whitespaces = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) + val = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) + spm.normalizer_spec.add_dummy_prefix = val if val is not None else False + val = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) + spm.normalizer_spec.remove_extra_whitespaces = val if val is not None else False tokens = get_list_field(reader, "tokenizer.ggml.tokens", str) scores = get_list_field(reader, "tokenizer.ggml.scores", float) toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int) - for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): - # # These aren't present in the original? 
- # if toktype == 5 and idx >= temb_shape[0]%1000): - # continue + eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) + pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) + unk_id = get_field(reader, "tokenizer.ggml.unknown_token_id", int) + bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) + if unk_id is None: + unk_id = get_field(reader, "tokenizer.ggml.unk_token_id", int) + + tokens = list(tokens) + scores = list(scores) + toktypes = list(toktypes) + unk_idxs = [i for i, t in enumerate(tokens) if t == ""] + if unk_idxs: + unk_id = unk_idxs[0] + tokens[unk_id] = "" + toktypes[unk_id] = 2 + for i in reversed(unk_idxs[1:]): + del tokens[i] + del scores[i] + del toktypes[i] + else: + unk_id = len(tokens) + tokens.append("") + scores.append(-100.0) + toktypes.append(2) + def ensure_control(id_val, name): + if id_val is not None and id_val < len(tokens): + if tokens[id_val] == f"<{name}>" and toktypes[id_val] != 3: + toktypes[id_val] = 3 + pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) + bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) + eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) + ensure_control(pad_id, "pad") + ensure_control(bos_id, "bos") + ensure_control(eos_id, "eos") + + def ensure_token(id_val, type_val, name): + nonlocal tokens, scores, toktypes + if id_val is not None and id_val >= len(tokens): + tokens = list(tokens) + [f"<{name}>"] + scores = list(scores) + [-100.0] + toktypes = list(toktypes) + [type_val] + + ensure_token(pad_id, 0, "pad") + ensure_token(bos_id, 1, "bos") + ensure_token(eos_id, 2, "eos") + for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): piece = spm.SentencePiece() piece.piece = token piece.score = score piece.type = toktype spm.pieces.append(piece) - # unsure if any of these are correct spm.trainer_spec.byte_fallback = True - spm.trainer_spec.vocab_size = len(tokens) # split off unused? 
+ spm.trainer_spec.vocab_size = len(tokens) spm.trainer_spec.max_sentence_length = 4096 - spm.trainer_spec.eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - spm.trainer_spec.pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - - logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}") + if eos_id is not None: + spm.trainer_spec.eos_id = eos_id + if pad_id is not None: + spm.trainer_spec.pad_id = pad_id + if unk_id is not None: + spm.trainer_spec.unk_id = unk_id + if bos_id is not None: + spm.trainer_spec.bos_id = bos_id + + import traceback + try: + vocab_size = len(spm.pieces) + print(f"[GGUF DEBUG] tokenizer vocab_size: {vocab_size}") + print(f"[GGUF DEBUG] token_embd.weight shape: {temb_shape}") + if temb_shape[0] < vocab_size: + print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") + raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") + elif temb_shape[0] > vocab_size: + print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") + raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") + + special_ids = [("unk_id", unk_id), ("pad_id", pad_id), ("bos_id", bos_id), ("eos_id", eos_id)] + for name, tid in special_ids: + print(f"[GGUF DEBUG] {name}: {tid}") + if tid is None or not (0 <= tid < vocab_size): + print(f"[GGUF ERROR] {name}={tid} 不在合法范围 0, {vocab_size}!") + raise RuntimeError(f"{name}={tid} 不在合法范围 0, {vocab_size}!") + except Exception as e: + print("[GGUF DEBUG] 发生异常:", e) + traceback.print_exc() + raise + + logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)} (unk_id={unk_id}, pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id})") del reader return torch.ByteTensor(list(spm.SerializeToString())) @@ -348,6 +499,35 @@ def gguf_clip_loader(path): if arch == "qwen2vl": vsd = gguf_mmproj_loader(path) sd.update(vsd) + elif arch == "gemma2": + temb_key = "token_embd.weight" + # Load tokenizer from GGUF metadata + if temb_key in sd: + try: + spm_tensor = gguf_tokenizer_loader(path, sd[temb_key].shape) + if spm_tensor is not None: + sd["spiece_model"] = spm_tensor + except NotImplementedError as e: + logging.error(f"[Gemma2] Failed to load tokenizer: {e}") + raise + if sd[temb_key].shape[0] >= (64 * 1024): + # Dequantize token embeddings to prevent OOM + logging.warning(f"Dequantizing {temb_key} to prevent runtime OOM.") + sd[temb_key] = dequantize_tensor(sd[temb_key], dtype=torch.float16) + sd = sd_map_replace(sd, GEMMA2_SD_MAP) + # Gemma2_2B has 8 attention heads and 4 key-value heads + sd = llama_permute(sd, 8, 4) + fix_keys = {} + for k in list(sd.keys()): + if k.startswith("model.layers."): + if ( + ("layernorm" in k or "mlp." 
in k or "proj" in k) + and not k.endswith(".weight") + and not k.endswith(".bias") + ): + fix_keys[k+".weight"] = sd[k] + del sd[k] + sd.update(fix_keys) else: pass return sd From 43d43f4bff22c8c03240380a066af054e8a1aef3 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:08:48 +0800 Subject: [PATCH 2/5] Add files via upload --- tools/convert_gemma2.py | 131 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 tools/convert_gemma2.py diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py new file mode 100644 index 0000000..fc15579 --- /dev/null +++ b/tools/convert_gemma2.py @@ -0,0 +1,131 @@ +# (c) City96 || Apache-2.0 +# Gemma2 safetensors -> GGUF 专用转换脚本,保留全部精度和元数据 +import os +import argparse +import logging +from safetensors.torch import load_file +import torch +import gguf +from tqdm import tqdm + +# Gemma2 key映射表(safetensors -> gguf) +KEY_MAP = { + # embedding + "model.embed_tokens.weight": "token_embd.weight", + # norm + "model.norm.weight": "output_norm.weight", + # spiece + "spiece_model": "tokenizer.ggml.spiece_model_raw", +} + +# 层参数映射 +LAYER_KEY_MAP = { + # LayerNorm + "input_layernorm.weight": "attn_norm.weight", + "post_attention_layernorm.weight": "post_attention_norm.weight", + "post_feedforward_layernorm.weight": "post_ffw_norm.weight", + "pre_feedforward_layernorm.weight": "ffn_norm.weight", + # MLP + "mlp.down_proj.weight": "ffn_down.weight", + "mlp.gate_proj.weight": "ffn_gate.weight", + "mlp.up_proj.weight": "ffn_up.weight", + # Attention + "self_attn.k_proj.weight": "attn_k.weight", + "self_attn.o_proj.weight": "attn_output.weight", + "self_attn.q_proj.weight": "attn_q.weight", + "self_attn.v_proj.weight": "attn_v.weight", +} + + +def parse_args(): + parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF (保留全部精度和元数据)") + parser.add_argument("--src", required=True, help="源 safetensors 文件") + parser.add_argument("--dst", help="输出 GGUF 文件") + args = parser.parse_args() + if not os.path.isfile(args.src): + parser.error("输入文件不存在!") + return args + + +def map_key(key): + # 直接映射 + if key in KEY_MAP: + return KEY_MAP[key] + # 层参数映射 + import re + m = re.match(r"model.layers.(\d+)\.(.+)", key) + if m: + layer_idx, subkey = m.groups() + if subkey in LAYER_KEY_MAP: + return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" + return key # 其他直接保留 + + +def main(): + args = parse_args() + state_dict = load_file(args.src) + # 统计主精度 + dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] + main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 + if main_dtype == torch.float32: + ftype_name = "F32" + ftype_gguf = gguf.GGMLQuantizationType.F32 + elif main_dtype == torch.bfloat16: + ftype_name = "BF16" + ftype_gguf = gguf.GGMLQuantizationType.BF16 + else: + ftype_name = "F16" + ftype_gguf = gguf.GGMLQuantizationType.F16 + dst = args.dst or f"{os.path.splitext(args.src)[0]}-{ftype_name}.gguf" + if os.path.isfile(dst): + input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") + writer = gguf.GGUFWriter(path=None, arch="gemma2") + writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + # 处理所有权重 + for key, value in tqdm(state_dict.items()): + new_key = map_key(key) + if key == "spiece_model": + # 转为 int8 存储,保证 gguf 支持 + arr = value.cpu().numpy().astype("int8") + writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) + tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") + continue + if not hasattr(value, 'dtype'): + tqdm.write(f"跳过非张量: {key}") + 
continue + arr = value.cpu().numpy() + # 精度策略:norm 层全部 F32,embedding/attn/mlp 优先 F16 + # norm 层 key 统一处理 + norm_keys = [ + "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", "output_norm.weight" + ] + # embedding key + emb_keys = ["token_embd.weight"] + # 判断是否 norm 层 + is_norm = any(new_key.endswith(nk) for nk in norm_keys) + is_emb = any(new_key == ek for ek in emb_keys) + # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype + if is_norm: + if value.dtype == torch.float32 or value.dtype == torch.bfloat16: + qtype = gguf.GGMLQuantizationType.F32 + elif value.dtype == torch.float16: + qtype = gguf.GGMLQuantizationType.F16 + else: + qtype = gguf.GGMLQuantizationType.F16 + elif is_emb: + qtype = gguf.GGMLQuantizationType.F16 + elif value.dtype == torch.bfloat16: + qtype = gguf.GGMLQuantizationType.BF16 + else: + qtype = gguf.GGMLQuantizationType.F16 + writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") + # 写入文件 + writer.write_header_to_file(path=dst) + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + print(f"转换完成: {dst}") + +if __name__ == "__main__": + main() From e366fd579b4422fc674db36a04aa67e1341c9b20 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:10:07 +0800 Subject: [PATCH 3/5] Update convert_gemma2.py --- tools/convert_gemma2.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py index fc15579..ced9e7f 100644 --- a/tools/convert_gemma2.py +++ b/tools/convert_gemma2.py @@ -1,5 +1,3 @@ -# (c) City96 || Apache-2.0 -# Gemma2 safetensors -> GGUF 专用转换脚本,保留全部精度和元数据 import os import argparse import logging @@ -8,7 +6,6 @@ import gguf from tqdm import tqdm -# Gemma2 key映射表(safetensors -> gguf) KEY_MAP = { # embedding "model.embed_tokens.weight": "token_embd.weight", @@ -18,7 +15,6 @@ "spiece_model": "tokenizer.ggml.spiece_model_raw", } -# 层参数映射 LAYER_KEY_MAP = { # LayerNorm "input_layernorm.weight": "attn_norm.weight", @@ -48,23 +44,20 @@ def parse_args(): def map_key(key): - # 直接映射 if key in KEY_MAP: return KEY_MAP[key] - # 层参数映射 import re m = re.match(r"model.layers.(\d+)\.(.+)", key) if m: layer_idx, subkey = m.groups() if subkey in LAYER_KEY_MAP: return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" - return key # 其他直接保留 + return key def main(): args = parse_args() state_dict = load_file(args.src) - # 统计主精度 dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 if main_dtype == torch.float32: @@ -81,11 +74,9 @@ def main(): input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") writer = gguf.GGUFWriter(path=None, arch="gemma2") writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - # 处理所有权重 for key, value in tqdm(state_dict.items()): new_key = map_key(key) if key == "spiece_model": - # 转为 int8 存储,保证 gguf 支持 arr = value.cpu().numpy().astype("int8") writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") @@ -94,14 +85,12 @@ def main(): tqdm.write(f"跳过非张量: {key}") continue arr = value.cpu().numpy() - # 精度策略:norm 层全部 F32,embedding/attn/mlp 优先 F16 - # norm 层 key 统一处理 + # norm 层全部 F32,embedding/attn/mlp 优先 F16 norm_keys = [ "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", 
"output_norm.weight" ] # embedding key emb_keys = ["token_embd.weight"] - # 判断是否 norm 层 is_norm = any(new_key.endswith(nk) for nk in norm_keys) is_emb = any(new_key == ek for ek in emb_keys) # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype @@ -120,7 +109,6 @@ def main(): qtype = gguf.GGMLQuantizationType.F16 writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") - # 写入文件 writer.write_header_to_file(path=dst) writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) @@ -129,3 +117,4 @@ def main(): if __name__ == "__main__": main() + From 7d233124e570f36523fcd3f183dec87008c1c701 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 03:58:33 +0800 Subject: [PATCH 4/5] Enhance Gemma2 conversion with quantization support Added quantization options and improved tensor handling. --- tools/convert_gemma2.py | 155 ++++++++++++++++++++++++++++------------ 1 file changed, 111 insertions(+), 44 deletions(-) diff --git a/tools/convert_gemma2.py b/tools/convert_gemma2.py index ced9e7f..afcd884 100644 --- a/tools/convert_gemma2.py +++ b/tools/convert_gemma2.py @@ -6,6 +6,7 @@ import gguf from tqdm import tqdm +# Gemma2 key mapping KEY_MAP = { # embedding "model.embed_tokens.weight": "token_embd.weight", @@ -15,6 +16,7 @@ "spiece_model": "tokenizer.ggml.spiece_model_raw", } +# Layer parameter mapping LAYER_KEY_MAP = { # LayerNorm "input_layernorm.weight": "attn_norm.weight", @@ -34,87 +36,152 @@ def parse_args(): - parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF (保留全部精度和元数据)") - parser.add_argument("--src", required=True, help="源 safetensors 文件") - parser.add_argument("--dst", help="输出 GGUF 文件") + parser = argparse.ArgumentParser(description="Convert Gemma2 safetensors to GGUF with precision preservation") + parser.add_argument("--src", required=True, help="Source safetensors file") + parser.add_argument("--dst", help="Output GGUF file") + parser.add_argument("--quantize", "--quant", "-q", + choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q2_k", "q3_k", "q4_k", "q5_k", "q6_k"], + help="Quantization type") args = parser.parse_args() if not os.path.isfile(args.src): - parser.error("输入文件不存在!") + parser.error("Input file does not exist!") return args def map_key(key): + # Direct mapping if key in KEY_MAP: return KEY_MAP[key] + # Layer parameter mapping import re m = re.match(r"model.layers.(\d+)\.(.+)", key) if m: layer_idx, subkey = m.groups() if subkey in LAYER_KEY_MAP: return f"blk.{layer_idx}.{LAYER_KEY_MAP[subkey]}" - return key + return key # Keep others as-is + + +def get_quantization_type(quant_str): + quant_map = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, + "bf16": gguf.GGMLQuantizationType.BF16, + "q8_0": gguf.GGMLQuantizationType.Q8_0, + "q4_0": gguf.GGMLQuantizationType.Q4_0, + "q4_1": gguf.GGMLQuantizationType.Q4_1, + "q5_0": gguf.GGMLQuantizationType.Q5_0, + "q5_1": gguf.GGMLQuantizationType.Q5_1, + "q2_k": gguf.GGMLQuantizationType.Q2_K, + "q3_k": gguf.GGMLQuantizationType.Q3_K, + "q4_k": gguf.GGMLQuantizationType.Q4_K, + "q5_k": gguf.GGMLQuantizationType.Q5_K, + "q6_k": gguf.GGMLQuantizationType.Q6_K, + } + return quant_map.get(quant_str.lower()) + + +def should_quantize_tensor(key, quant_type): + """Determine if a tensor should be quantized + Rules: + - token_embd (embedding) kept at F16 (quantization severely impacts quality) + - norm layers kept at F32 
(quantization affects stability) + - other weights (attn/mlp) use target quantization + """ + # Embedding always kept at F16 + if key == "token_embd.weight": + return False, gguf.GGMLQuantizationType.F16 + + # Norm layers kept at F32 + norm_suffixes = [ + "attn_norm.weight", + "post_attention_norm.weight", + "post_ffw_norm.weight", + "ffn_norm.weight", + "output_norm.weight" + ] + if any(key.endswith(suffix) for suffix in norm_suffixes): + return False, gguf.GGMLQuantizationType.F32 + + # Other layers (attn/mlp) use target quantization + return True, quant_type def main(): args = parse_args() state_dict = load_file(args.src) - dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] - main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 - if main_dtype == torch.float32: - ftype_name = "F32" - ftype_gguf = gguf.GGMLQuantizationType.F32 - elif main_dtype == torch.bfloat16: - ftype_name = "BF16" - ftype_gguf = gguf.GGMLQuantizationType.BF16 + + if args.quantize: + quant_type = get_quantization_type(args.quantize) + ftype_name = args.quantize.upper() else: - ftype_name = "F16" - ftype_gguf = gguf.GGMLQuantizationType.F16 + dtypes = [v.dtype for v in state_dict.values() if hasattr(v, 'dtype')] + main_dtype = max(set(dtypes), key=dtypes.count) if dtypes else torch.float16 + if main_dtype == torch.float32: + ftype_name = "F32" + quant_type = gguf.GGMLQuantizationType.F32 + elif main_dtype == torch.bfloat16: + ftype_name = "BF16" + quant_type = gguf.GGMLQuantizationType.BF16 + else: + ftype_name = "F16" + quant_type = gguf.GGMLQuantizationType.F16 + dst = args.dst or f"{os.path.splitext(args.src)[0]}-{ftype_name}.gguf" if os.path.isfile(dst): - input(f"输出文件 {dst} 已存在,按回车覆盖或 Ctrl+C 取消...") + input(f"Output file {dst} exists, press Enter to overwrite or Ctrl+C to cancel...") + writer = gguf.GGUFWriter(path=None, arch="gemma2") writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - for key, value in tqdm(state_dict.items()): + + print(f"Target quantization: {ftype_name}") + print(f"Output file: {dst}") + + for key, value in tqdm(state_dict.items(), desc="Converting"): new_key = map_key(key) + + # Special handling for spiece_model if key == "spiece_model": arr = value.cpu().numpy().astype("int8") writer.add_tensor(new_key, arr, raw_dtype=gguf.GGMLQuantizationType.I8) - tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape} bytes, int8)") + tqdm.write(f"{key} -> {new_key} (spiece_model, {arr.shape[0]} bytes, I8)") continue + if not hasattr(value, 'dtype'): - tqdm.write(f"跳过非张量: {key}") + tqdm.write(f"Skipping non-tensor: {key}") continue + arr = value.cpu().numpy() - # norm 层全部 F32,embedding/attn/mlp 优先 F16 - norm_keys = [ - "attn_norm.weight", "post_attention_norm.weight", "post_ffw_norm.weight", "ffn_norm.weight", "output_norm.weight" - ] - # embedding key - emb_keys = ["token_embd.weight"] - is_norm = any(new_key.endswith(nk) for nk in norm_keys) - is_emb = any(new_key == ek for ek in emb_keys) - # norm 层只有原始为 float32/bfloat16 时才保留 F32,否则保持原始 dtype - if is_norm: - if value.dtype == torch.float32 or value.dtype == torch.bfloat16: - qtype = gguf.GGMLQuantizationType.F32 - elif value.dtype == torch.float16: - qtype = gguf.GGMLQuantizationType.F16 - else: - qtype = gguf.GGMLQuantizationType.F16 - elif is_emb: - qtype = gguf.GGMLQuantizationType.F16 - elif value.dtype == torch.bfloat16: - qtype = gguf.GGMLQuantizationType.BF16 + + # Determine if quantization needed + get target precision + should_quant, target_qtype = should_quantize_tensor(new_key, 
quant_type) + + # Apply quantization or keep original precision + if should_quant and target_qtype not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16]: + quantized_arr = gguf.quants.quantize(arr, target_qtype) + writer.add_tensor(new_key, quantized_arr, raw_dtype=target_qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {target_qtype.name}, shape={arr.shape}") else: - qtype = gguf.GGMLQuantizationType.F16 - writer.add_tensor(new_key, gguf.quants.quantize(arr, qtype), raw_dtype=qtype) - tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {qtype.name}, shape={arr.shape}") + if target_qtype == gguf.GGMLQuantizationType.F32: + arr = arr.astype('float32') + elif target_qtype == gguf.GGMLQuantizationType.BF16: + # BF16 requires special handling + pass # gguf.quants.quantize handles this + else: # F16 + arr = arr.astype('float16') + + quantized_arr = gguf.quants.quantize(arr, target_qtype) + writer.add_tensor(new_key, quantized_arr, raw_dtype=target_qtype) + tqdm.write(f"{key} -> {new_key}, {value.dtype} -> {target_qtype.name}, shape={arr.shape}") + + print("Writing GGUF file...") writer.write_header_to_file(path=dst) writer.write_kv_data_to_file() writer.write_tensors_to_file(progress=True) writer.close() - print(f"转换完成: {dst}") + print(f"Conversion complete: {dst}") + print(f"Quantization type: {ftype_name}") + print(f"File size: {os.path.getsize(dst) / (1024**3):.2f} GB") if __name__ == "__main__": main() - From cc07b1ae92330b03e8216b5dc7d3a92db8564379 Mon Sep 17 00:00:00 2001 From: spawner Date: Fri, 3 Oct 2025 22:11:23 +0800 Subject: [PATCH 5/5] Update loader.py --- loader.py | 148 ++++-------------------------------------------------- 1 file changed, 9 insertions(+), 139 deletions(-) diff --git a/loader.py b/loader.py index 5b0466a..684655b 100644 --- a/loader.py +++ b/loader.py @@ -181,7 +181,7 @@ def gguf_sd_loader(path, handle_prefix="model.diffusion_model.", return_arch=Fal ".attn_norm.weight": ".input_layernorm.weight", ".post_attention_norm.weight": ".post_attention_layernorm.weight", ".post_ffw_norm.weight": ".post_feedforward_layernorm.weight", - ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors 只有 pre_feedforward_layernorm + ".ffn_norm.weight": ".pre_feedforward_layernorm.weight", # Gemma2 safetensors only has pre_feedforward_layernorm # MLP ".ffn_up.weight": ".mlp.up_proj.weight", ".ffn_down.weight": ".mlp.down_proj.weight", @@ -333,147 +333,17 @@ def gguf_tokenizer_loader(path, temb_shape): return torch.ByteTensor(list(proto_bytes)) except Exception as e: logging.warning(f"Failed to parse proto from int8 tensor: {e}") - # 继续 fallback - # fallback: 兼容旧字符串字段 + spiece_tensor = reader.get_tensor("tokenizer.ggml.spiece_model_raw") + if spiece_tensor is not None: + del reader + return spiece_tensor raw_proto_field = get_field(reader, "tokenizer.ggml.spiece_model_raw", str) if raw_proto_field is not None: - try: - proto_bytes = raw_proto_field.encode('latin1') - spm = model.ModelProto() - spm.ParseFromString(proto_bytes) - vocab_size = len(spm.pieces) - logging.info(f"✓ Loaded complete sentencepiece proto from GGUF metadata: {vocab_size} pieces, " - f"{len(proto_bytes)} bytes (legacy string field)") - logging.info(f" unk_id={spm.trainer_spec.unk_id}, bos_id={spm.trainer_spec.bos_id}, " - f"eos_id={spm.trainer_spec.eos_id}, pad_id={spm.trainer_spec.pad_id}") - if temb_shape[0] != vocab_size: - logging.warning(f"Proto vocab_size ({vocab_size}) != embedding shape[0] ({temb_shape[0]})") - del reader - return 
torch.ByteTensor(list(proto_bytes)) - except Exception as e: - logging.warning(f"Failed to load complete proto from metadata: {e}") - logging.warning("Falling back to reconstructing tokenizer from GGUF fields...") - else: - logging.info("No complete sentencepiece proto found in GGUF metadata") - logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...") - - spm = model.ModelProto() - - tokenizer_model = get_field(reader, "tokenizer.ggml.model", str) - if tokenizer_model == "t5": - if temb_shape == (256384, 4096): # probably UMT5 - spm.trainer_spec.model_type == 1 # Unigram (do we have a T5 w/ BPE?) - else: - raise NotImplementedError("Unknown model, can't set tokenizer!") - elif tokenizer_model == "llama": - # Gemma2 uses llama tokenizer model - if temb_shape[0] == 256000: # Gemma2_2B vocab_size - spm.trainer_spec.model_type = 1 # Unigram - else: - raise NotImplementedError("Unknown llama-based model, can't set tokenizer!") - else: - raise NotImplementedError(f"Unknown tokenizer model '{tokenizer_model}', can't set tokenizer!") - - val = get_field(reader, "tokenizer.ggml.add_space_prefix", bool) - spm.normalizer_spec.add_dummy_prefix = val if val is not None else False - val = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool) - spm.normalizer_spec.remove_extra_whitespaces = val if val is not None else False - - tokens = get_list_field(reader, "tokenizer.ggml.tokens", str) - scores = get_list_field(reader, "tokenizer.ggml.scores", float) - toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int) - - eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - unk_id = get_field(reader, "tokenizer.ggml.unknown_token_id", int) - bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) - if unk_id is None: - unk_id = get_field(reader, "tokenizer.ggml.unk_token_id", int) - - tokens = list(tokens) - scores = list(scores) - toktypes = list(toktypes) - unk_idxs = [i for i, t in enumerate(tokens) if t == ""] - if unk_idxs: - unk_id = unk_idxs[0] - tokens[unk_id] = "" - toktypes[unk_id] = 2 - for i in reversed(unk_idxs[1:]): - del tokens[i] - del scores[i] - del toktypes[i] - else: - unk_id = len(tokens) - tokens.append("") - scores.append(-100.0) - toktypes.append(2) - def ensure_control(id_val, name): - if id_val is not None and id_val < len(tokens): - if tokens[id_val] == f"<{name}>" and toktypes[id_val] != 3: - toktypes[id_val] = 3 - pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int) - bos_id = get_field(reader, "tokenizer.ggml.bos_token_id", int) - eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int) - ensure_control(pad_id, "pad") - ensure_control(bos_id, "bos") - ensure_control(eos_id, "eos") - - def ensure_token(id_val, type_val, name): - nonlocal tokens, scores, toktypes - if id_val is not None and id_val >= len(tokens): - tokens = list(tokens) + [f"<{name}>"] - scores = list(scores) + [-100.0] - toktypes = list(toktypes) + [type_val] - - ensure_token(pad_id, 0, "pad") - ensure_token(bos_id, 1, "bos") - ensure_token(eos_id, 2, "eos") - - for idx, (token, score, toktype) in enumerate(zip(tokens, scores, toktypes)): - piece = spm.SentencePiece() - piece.piece = token - piece.score = score - piece.type = toktype - spm.pieces.append(piece) - - spm.trainer_spec.byte_fallback = True - spm.trainer_spec.vocab_size = len(tokens) - spm.trainer_spec.max_sentence_length = 4096 - if eos_id is not None: - spm.trainer_spec.eos_id = eos_id - if 
pad_id is not None: - spm.trainer_spec.pad_id = pad_id - if unk_id is not None: - spm.trainer_spec.unk_id = unk_id - if bos_id is not None: - spm.trainer_spec.bos_id = bos_id - - import traceback - try: - vocab_size = len(spm.pieces) - print(f"[GGUF DEBUG] tokenizer vocab_size: {vocab_size}") - print(f"[GGUF DEBUG] token_embd.weight shape: {temb_shape}") - if temb_shape[0] < vocab_size: - print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") - raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 小于 tokenizer vocab_size {vocab_size}!") - elif temb_shape[0] > vocab_size: - print(f"[GGUF ERROR] token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") - raise RuntimeError(f"token_embd.weight 行数 {temb_shape[0]} 大于 tokenizer vocab_size {vocab_size}!") - - special_ids = [("unk_id", unk_id), ("pad_id", pad_id), ("bos_id", bos_id), ("eos_id", eos_id)] - for name, tid in special_ids: - print(f"[GGUF DEBUG] {name}: {tid}") - if tid is None or not (0 <= tid < vocab_size): - print(f"[GGUF ERROR] {name}={tid} 不在合法范围 0, {vocab_size}!") - raise RuntimeError(f"{name}={tid} 不在合法范围 0, {vocab_size}!") - except Exception as e: - print("[GGUF DEBUG] 发生异常:", e) - traceback.print_exc() - raise - - logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)} (unk_id={unk_id}, pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id})") + proto_bytes = raw_proto_field.encode('latin1') + del reader + return torch.ByteTensor(list(proto_bytes)) del reader - return torch.ByteTensor(list(spm.SerializeToString())) + raise NotImplementedError("No sentencepiece proto found in GGUF metadata!") def gguf_clip_loader(path): sd, arch = gguf_sd_loader(path, return_arch=True, is_text_model=True)
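
The converter and the loader in this series define mirror-image key tables: tools/convert_gemma2.py renames Hugging Face/safetensors keys to GGUF names at conversion time (map_key with KEY_MAP/LAYER_KEY_MAP), and GEMMA2_SD_MAP in loader.py maps them back at load time via sd_map_replace's substring replacement. A minimal sketch of that round trip, restating only a few entries from the maps in this patch (abbreviated, not the full tables):

import re

# Subset of LAYER_KEY_MAP from tools/convert_gemma2.py (safetensors -> GGUF)
LAYER_KEY_MAP = {
    "self_attn.q_proj.weight": "attn_q.weight",
    "input_layernorm.weight": "attn_norm.weight",
    "mlp.up_proj.weight": "ffn_up.weight",
}

# Subset of GEMMA2_SD_MAP from loader.py (GGUF -> safetensors),
# applied as plain substring replacement like sd_map_replace()
GEMMA2_SD_MAP = {
    "blk.": "model.layers.",
    ".attn_q.weight": ".self_attn.q_proj.weight",
    ".attn_norm.weight": ".input_layernorm.weight",
    ".ffn_up.weight": ".mlp.up_proj.weight",
}

def to_gguf(key):
    # mirrors map_key(): per-layer keys become blk.<n>.<gguf name>
    m = re.match(r"model\.layers\.(\d+)\.(.+)", key)
    if m and m.group(2) in LAYER_KEY_MAP:
        return f"blk.{m.group(1)}.{LAYER_KEY_MAP[m.group(2)]}"
    return key

def to_safetensors(key):
    # mirrors sd_map_replace(): replace every matching substring from the map
    for s, d in GEMMA2_SD_MAP.items():
        if s in key:
            key = key.replace(s, d)
    return key

k = "model.layers.0.self_attn.q_proj.weight"
g = to_gguf(k)                   # "blk.0.attn_q.weight"
assert to_safetensors(g) == k    # round-trips back to the safetensors name

Note that in the actual loader the q/k projections additionally go through llama_permute(sd, 8, 4) after the rename; the sketch only covers the naming side.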
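
The converter also expects a "spiece_model" entry in the source state dict (KEY_MAP maps it to tokenizer.ggml.spiece_model_raw) and stores the serialized sentencepiece proto as a raw I8 tensor; after PATCH 5, gguf_tokenizer_loader simply reads that tensor back as bytes instead of reconstructing the tokenizer field by field. The uint8 -> int8 cast used for storage is lossless because both views share the same underlying bytes; a small self-contained check of that assumption (no GGUF file involved, the byte string below is an arbitrary stand-in):

import numpy as np

proto_bytes = bytes(range(256))  # stand-in for a serialized sentencepiece proto
as_i8 = np.frombuffer(proto_bytes, dtype=np.uint8).astype(np.int8)  # what the converter writes as an I8 tensor
restored = as_i8.tobytes()       # what the loader recovers from tensor.data
assert restored == proto_bytes   # two's-complement reinterpretation is byte-exact

For the conversion itself, the script from PATCH 4 would be run as something like python tools/convert_gemma2.py --src <gemma2>.safetensors -q q8_0, where the file names are placeholders; the --quantize choices, and the rule in should_quantize_tensor that norm weights stay F32 and token_embd.weight stays F16 while attention/MLP weights take the requested quantization, come straight from the patch.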