#!/usr/bin/env python3
# HF CodeShell --> gguf conversion
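# Example invocation (script filename and model path are illustrative):
#   python3 convert-codeshell-hf-to-gguf.py /path/to/codeshell-hf 1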

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

# prefer the gguf-py package located next to this script over an installed one, unless NO_LOCAL_GGUF is set
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

def count_model_parts(dir_model: Path) -> int:
    # count the pytorch_model-*.bin checkpoint shards in the model directory
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a CodeShell model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1, nargs='?')
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "CodeShellForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

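# the GGUF writer collects all key/value metadata and tensor data first and
# flushes them to the output file at the end of this script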
ARCH = gguf.MODEL_ARCH.CODESHELL
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("CodeShell")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(hparams["num_query_groups"])
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# RoPE parameters are hard-coded here rather than read from config.json
gguf_writer.add_rope_freq_base(10000.0)
gguf_writer.add_rope_scale_linear(1.0)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[str] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running inference.
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

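# token ids missing from the tokenizer vocab are filled with [PAD{id}] placeholders
# so that the token list matches the expected vocab_size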
for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

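# SpecialVocab is expected to pick up the BPE merges and any special token ids from the model directory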
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head    = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

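# load each checkpoint shard on the CPU and convert its tensors one at a time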
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            sys.exit(1)

        if "rotary_emb.inv_freq" in name:
            print(f"skipping tensor {new_name}")
            continue

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")