This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

migrate PR: [LLM Runtime] Magicoder graph #41

Merged: 10 commits, Jan 16, 2024
8 changes: 8 additions & 0 deletions README.md
@@ -184,6 +184,14 @@ Neural Speed supports the following models:
<td>✅</td>
<td>✅</td>
<td>Latest</td>
</tr>
<tr>
<td><a href="https://huggingface.co/ise-uiuc/Magicoder-S-DS-6.7B" target="_blank" rel="noopener noreferrer">Magicoder-6.7B</a></td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>✅</td>
<td>Latest</td>
</tr>
<tr>
<td><a href="https://huggingface.co/bigcode/starcoderbase-1b" target="_blank" rel="noopener noreferrer">StarCoder-1B</a>,
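The new table row above adds Magicoder-6.7B to the supported-model list. For orientation only, here is a hedged usage sketch that is not part of this diff; it assumes the high-level neural_speed.Model Python API from the project's usage examples also covers the new entry, and every name other than the linked model id (quantization dtypes, prompt, generation settings) is an illustrative assumption.

    # Hedged sketch (not part of this PR): names and dtypes other than the model id are assumptions.
    from transformers import AutoTokenizer, TextStreamer
    from neural_speed import Model

    model_name = "ise-uiuc/Magicoder-S-DS-6.7B"   # the checkpoint linked in the table row above
    prompt = "def quicksort(arr):"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    streamer = TextStreamer(tokenizer)

    model = Model()
    model.init(model_name, weight_dtype="int4", compute_dtype="int8")   # assumed quantization settings
    outputs = model.generate(inputs, streamer=streamer, max_new_tokens=128)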
1 change: 1 addition & 0 deletions neural_speed/convert/convert_baichuan.py
@@ -158,6 +158,7 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
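Every converter in this PR gains the same one-line write because the model loader reads the binary header as a fixed sequence of packed fields: if any single converter skipped the new float, every later field the loader reads would shift by four bytes. Below is a self-contained toy illustration of that lockstep, not the actual Neural Speed header layout.

    import io
    import struct

    # Toy header containing the three consecutive floats written in the hunk above.
    buf = io.BytesIO()
    buf.write(struct.pack("f", 1e-6))      # rms norm eps
    buf.write(struct.pack("f", 10000.0))   # freq_base
    buf.write(struct.pack("f", 1.0))       # rope_factor (the field this PR adds)

    # A reader that knows about rope_factor consumes exactly 12 bytes here; without
    # the extra write it would misinterpret whatever comes next as rope_factor.
    buf.seek(0)
    rms_norm_eps, freq_base, rope_factor = struct.unpack("3f", buf.read(12))
    print(rms_norm_eps, freq_base, rope_factor)   # the epsilon round-trips as a float32 value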
1 change: 1 addition & 0 deletions neural_speed/convert/convert_bloom.py
@@ -101,6 +101,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_chatglm.py
@@ -180,6 +180,7 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_dolly.py
@@ -115,6 +115,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_falcon.py
@@ -109,6 +109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_gptj.py
@@ -101,6 +101,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_gptneox.py
@@ -115,6 +115,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
74 changes: 48 additions & 26 deletions neural_speed/convert/convert_llama.py
@@ -150,6 +150,9 @@ class Params:
ffn_hidden_size: int
rms_norm_eps: float
rope_theta: float
rope_scale: float
bos_token_id: int
eos_token_id: int

@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@@ -178,6 +181,11 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size = config["intermediate_size"]
rms_norm_eps = config["rms_norm_eps"]
rope_theta = config["rope_theta"] if "rope_theta" in config else 10000
rope_scale = 1
if config["rope_scaling"]:
Reviewer comment (Contributor): please check whether "rope_scaling" is in config.
Suggested change: replace
    if config["rope_scaling"]:
with
    if "rope_scaling" in config and config["rope_scaling"] is not None:

rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1
bos_token_id = config["bos_token_id"]
eos_token_id = config["eos_token_id"]

return Params(
n_vocab=n_vocab,
@@ -189,6 +197,9 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size=ffn_hidden_size,
rms_norm_eps=rms_norm_eps,
rope_theta=rope_theta,
rope_scale=rope_scale,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
)

# LLaMA v2 70B params.json
@@ -204,6 +215,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
ffn_hidden_size = config["intermediate_size"]
bos_token_id = config["bos_token_id"]
eos_token_id = config["eos_token_id"]
# hack to determine LLaMA v1 vs v2 vs CodeLlama

if n_vocab == -1:
@@ -217,6 +230,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params':
n_head=n_head,
n_head_kv=n_head_kv,
ffn_hidden_size=ffn_hidden_size,
bos_token_id = bos_token_id,
eos_token_id = eos_token_id,
)

@staticmethod
@@ -239,7 +254,7 @@ def load(model: 'ModelPlus') -> 'Params':


class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_tokens: Optional[Path]) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: Dict[str, int]
if fname_added_tokens is not None:
@@ -258,25 +273,31 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
self.params_vocab_size = params_vocab_size

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
+        for i in range(self.params_vocab_size):
+            text: bytes
+            if i < tokenizer.vocab_size():
+                if tokenizer.is_unknown(i):
+                    text = " \u2047 ".encode("utf-8")
+                elif tokenizer.is_control(i):
+                    text = b""
+                elif tokenizer.is_byte(i):
+                    piece = tokenizer.id_to_piece(i)
+                    if len(piece) != 6:
+                        raise Exception(f"Invalid token: {piece}")
+                    byte_value = int(piece[3:-1], 16)
+                    text = struct.pack("B", byte_value)
+                else:
+                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+                score: float = tokenizer.get_score(i)
+                yield text, score
+            else :
                 text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
-            yield text, score
+                score: float = i
+                yield text, score
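The rewritten generator above pads the emitted vocabulary out to the vocab size recorded in the model config when the SentencePiece model has fewer pieces, emitting placeholder entries with dummy scores; the Magicoder checkpoint targeted by this PR appears to need this, since a config vocab size can exceed the tokenizer's piece count. A dependency-free sketch of the padding idea follows; the two sizes are made-up numbers, not values from this PR.

    # Illustrative sizes only: the tokenizer knows 32000 pieces, the config claims 32256.
    tokenizer_vocab_size = 32000
    params_vocab_size = 32256

    def padded_tokens(real_tokens, tokenizer_vocab_size, params_vocab_size):
        # real_tokens: iterable of (text, score) pairs for ids below tokenizer_vocab_size
        for text, score in real_tokens:
            yield text, score
        # Pad the remaining ids with placeholder text and a dummy score, as above.
        for i in range(tokenizer_vocab_size, params_vocab_size):
            yield " \u2047 ".encode("utf-8"), float(i)

    real = ((f"tok{i}".encode("utf-8"), 0.0) for i in range(tokenizer_vocab_size))
    total = sum(1 for _ in padded_tokens(real, tokenizer_vocab_size, params_vocab_size))
    print(total)   # 32256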

def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
@@ -1063,13 +1084,14 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(struct.pack("f", params.rms_norm_eps))
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", 1))
self.fout.write(struct.pack("i", 2))
self.fout.write(struct.pack("i", 0))
self.fout.write(struct.pack("i", 0))
self.fout.write(struct.pack("i", params.bos_token_id))
self.fout.write(struct.pack("i", params.eos_token_id))
self.fout.write(struct.pack("i", -1))
self.fout.write(struct.pack("i", -1))

def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
sname = name.encode('utf-8')
@@ -1095,7 +1117,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:

@staticmethod
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None:
check_vocab_size(params, vocab)
#check_vocab_size(params, vocab)
of = OutputFile(fname_out)
of.write_file_header(params, file_type)
print("Writing vocab...")
@@ -1224,7 +1246,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
return {name: model[name] for name in TENSORS_LIST if name in model}


def load_vocab(path: Path) -> SentencePieceVocab:
def load_vocab(path: Path, params_vocab_size: int) -> SentencePieceVocab:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
@@ -1243,7 +1265,7 @@ def load_vocab(path: Path) -> SentencePieceVocab:
)
added_tokens_path = path.parent / "added_tokens.json"
print(f"Loading vocab file {path}")
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
return SentencePieceVocab(path, params_vocab_size, added_tokens_path if added_tokens_path.exists() else None)


def default_outfile(model_paths: List[Path], params: Params) -> Path:
@@ -1306,13 +1328,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
model = model_plus.model
params = Params.load(model_plus)
if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab
else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir)
model = model_plus.model
params = Params.load(model_plus)
vocab = load_vocab(vocab_dir, params.n_vocab)
model = do_necessary_conversions(model, params)
output_type = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, output_type)
4 changes: 4 additions & 0 deletions neural_speed/convert/convert_mistral.py
@@ -151,6 +151,7 @@ class Params:
ffn_hidden_size: int
rms_norm_eps: float
rope_theta: float
rope_scale: float

@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@@ -179,6 +180,8 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
ffn_hidden_size = config["intermediate_size"]
rms_norm_eps = config["rms_norm_eps"]
rope_theta = config["rope_theta"] if "rope_theta" in config else 10000
rope_scale = config["factor"] if "factor" in config else 1
Reviewer comment (Contributor): mistral should align to llama.
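A sketch of what aligning the Mistral converter with the llama one might look like, mirroring the guarded rope_scaling lookup suggested earlier in this review; the sample config dict is illustrative, not taken from the PR.

    # HF-style config with linear RoPE scaling; "rope_scaling" may also be absent or null.
    config = {"rope_theta": 10000.0, "rope_scaling": {"type": "linear", "factor": 4.0}}

    rope_scale = 1
    if "rope_scaling" in config and config["rope_scaling"] is not None:
        rope_scale = config["rope_scaling"].get("factor", 1)
    print(rope_scale)   # 4.0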

Reviewer comment (Contributor): please help update convert_quantized_llama.py and convert_quantized_mistral.py.

return Params(
n_vocab=n_vocab,
@@ -1057,6 +1060,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(struct.pack("i", 0))
self.fout.write(struct.pack("f", params.rms_norm_eps))
self.fout.write(struct.pack("f", params.rope_theta))
self.fout.write(struct.pack("f", params.rope_scale))

self.fout.write(
struct.pack("i", 1)
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_mpt.py
@@ -97,7 +97,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base

fout.write(struct.pack("f", 1.0)) # rope_factor
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_opt.py
@@ -108,6 +108,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_qwen.py
@@ -114,6 +114,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
1 change: 1 addition & 0 deletions neural_speed/convert/convert_starcoder.py
@@ -112,6 +112,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("i", 0))
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))