Commit 00e5e88

add codeshell support
1 parent 11dc109 commit 00e5e88

3 files changed, +708 -2 lines changed

convert-codeshell-hf-to-gguf.py

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
# HF CodeShell --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a CodeShell model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default=1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "CodeShellForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.CODESHELL
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("CodeShell")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(hparams["num_query_groups"])
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
gguf_writer.add_rope_freq_base(10000.0)
gguf_writer.add_rope_scale_linear(1.0)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        if "rotary_emb.inv_freq" in name:
            print(f"skipping tensor {new_name}")
            continue

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")

gguf-py/gguf/gguf.py

Lines changed: 21 additions & 1 deletion
@@ -89,6 +89,7 @@ class MODEL_ARCH(IntEnum):
     REFACT : int = auto()
     BERT : int = auto()
     BLOOM : int = auto()
+    CODESHELL : int = auto()


 class MODEL_TENSOR(IntEnum):
@@ -128,6 +129,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.REFACT: "refact",
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.BLOOM: "bloom",
+    MODEL_ARCH.CODESHELL: "codeshell",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -298,6 +300,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.CODESHELL: {
+        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+        MODEL_TENSOR.POS_EMBD: "position_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT: "output",
+        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+    },
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -316,7 +331,11 @@ class MODEL_TENSOR(IntEnum):
     ],
     MODEL_ARCH.PERSIMMON: [
         MODEL_TENSOR.ROPE_FREQS,
-    ]
+    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }


@@ -448,6 +467,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
             "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
+            "transformer.h.{bid}.attn.rotary_emb.inv_freq",      # codeshell
         ),

         # Feed-forward norm
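
With the CODESHELL entries registered above, the tensor name map used by the converter can translate CodeShell's Hugging Face tensor names into GGUF names. A rough sketch of that lookup (the block count of 4 is an arbitrary illustration value, and the snippet assumes the gguf package from gguf-py is importable):

    import gguf

    # Build the HF-name -> GGUF-name map for the CodeShell architecture.
    tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CODESHELL, 4)

    # The mapping added in this commit should resolve the per-block inv_freq tensor;
    # the converter prints "skipping tensor ..." for it and does not serialize it.
    name = tensor_map.get_name("transformer.h.0.attn.rotary_emb.inv_freq",
                               try_suffixes=(".weight", ".bias"))
    print(name)  # expected: blk.0.attn_rot_embd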
