diff --git a/README.md b/README.md index 5f703ca8e..0361d65eb 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,14 @@ Neural Speed supports the following models: ✅ ✅ Latest + + + Magicoder-6.7B + ✅ + ✅ + ✅ + ✅ + Latest StarCoder-1B, diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index 08514dbdc..1b9412382 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -132,8 +132,14 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs): self.model = self.module.Model() if "threads" not in generate_kwargs: threads = os.getenv("OMP_NUM_THREADS") + import platform + sys_platform = platform.platform().lower() if threads is None: - generate_kwargs["threads"] = len(os.sched_getaffinity(0)) + if "windows" in sys_platform: + cpu_count = os.cpu_count() + generate_kwargs["threads"] = int(cpu_count) + else: + generate_kwargs["threads"] = len(os.sched_getaffinity(0)) else: generate_kwargs["threads"] = int(threads) self.model.init_model(model_path, **generate_kwargs) diff --git a/neural_speed/convert/convert_baichuan.py b/neural_speed/convert/convert_baichuan.py index 247f90eae..bbeb3c6d0 100644 --- a/neural_speed/convert/convert_baichuan.py +++ b/neural_speed/convert/convert_baichuan.py @@ -158,6 +158,7 @@ def baichuan13B_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", hparams["intermediate_size"])) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_bloom.py b/neural_speed/convert/convert_bloom.py index 3fca71b51..7e2a3f805 100644 --- a/neural_speed/convert/convert_bloom.py +++ b/neural_speed/convert/convert_bloom.py @@ -101,6 +101,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_chatglm.py b/neural_speed/convert/convert_chatglm.py index 38f986809..0eed8e130 100644 --- a/neural_speed/convert/convert_chatglm.py +++ b/neural_speed/convert/convert_chatglm.py @@ -358,6 +358,7 @@ def chatglm2_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("layernorm_epsilon", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index f5b589f96..dc77b1c43 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -115,6 +115,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", 
hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 2f1ddd2b8..9d323f89d 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py @@ -109,6 +109,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index c7b89b2d8..2f6c8e673 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -101,6 +101,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 3b67daa31..409cc05ba 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -115,6 +115,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 8b448c95d..511608d52 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -151,6 +151,9 @@ class Params: ffn_hidden_size: int rms_norm_eps: float rope_theta: float + rope_scale: float + bos_token_id: int + eos_token_id: int @staticmethod def guessed(model: 'LazyModel') -> 'Params': @@ -180,6 +183,11 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size = config["intermediate_size"] rms_norm_eps = config["rms_norm_eps"] rope_theta = config["rope_theta"] if "rope_theta" in config else 10000 + rope_scale = 1 + if "rope_scaling" in config and config["rope_scaling"] is not None: + rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 + bos_token_id = config["bos_token_id"] + eos_token_id = config["eos_token_id"] return Params( n_vocab=n_vocab, @@ -191,6 +199,9 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size=ffn_hidden_size, 
rms_norm_eps=rms_norm_eps, rope_theta=rope_theta, + rope_scale=rope_scale, + bos_token_id = bos_token_id, + eos_token_id = eos_token_id, ) # LLaMA v2 70B params.json @@ -206,6 +217,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': n_head = config["n_heads"] n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head ffn_hidden_size = config["intermediate_size"] + bos_token_id = config["bos_token_id"] + eos_token_id = config["eos_token_id"] # hack to determine LLaMA v1 vs v2 vs CodeLlama if n_vocab == -1: @@ -219,6 +232,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': n_head=n_head, n_head_kv=n_head_kv, ffn_hidden_size=ffn_hidden_size, + bos_token_id = bos_token_id, + eos_token_id = eos_token_id, ) @staticmethod @@ -241,11 +256,11 @@ def load(model: 'ModelPlus') -> 'Params': class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_tokens: Optional[Path]) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens)) + added_tokens = json.load(open(fname_added_tokens, encoding='utf-8')) else: added_tokens = {} vocab_size: int = self.sentencepiece_tokenizer.vocab_size() @@ -260,25 +275,31 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens + self.params_vocab_size = params_vocab_size def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - text: bytes - if tokenizer.is_unknown(i): + for i in range(self.params_vocab_size): + text: bytes + if i < tokenizer.vocab_size(): + if tokenizer.is_unknown(i): + text = " \u2047 ".encode("utf-8") + elif tokenizer.is_control(i): + text = b"" + elif tokenizer.is_byte(i): + piece = tokenizer.id_to_piece(i) + if len(piece) != 6: + raise Exception(f"Invalid token: {piece}") + byte_value = int(piece[3:-1], 16) + text = struct.pack("B", byte_value) + else: + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") + score: float = tokenizer.get_score(i) + yield text, score + else : text = " \u2047 ".encode("utf-8") - elif tokenizer.is_control(i): - text = b"" - elif tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - raise Exception(f"Invalid token: {piece}") - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - score: float = tokenizer.get_score(i) - yield text, score + score: float = i + yield text, score def added_tokens(self) -> Iterable[Tuple[bytes, float]]: for text in self.added_tokens_list: @@ -1066,13 +1087,14 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("f", params.rms_norm_eps)) self.fout.write(struct.pack("f", params.rope_theta)) + self.fout.write(struct.pack("f", params.rope_scale)) # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp - self.fout.write(struct.pack("i", 1)) - self.fout.write(struct.pack("i", 2)) - 
self.fout.write(struct.pack("i", 0)) - self.fout.write(struct.pack("i", 0)) + self.fout.write(struct.pack("i", params.bos_token_id)) + self.fout.write(struct.pack("i", params.eos_token_id)) + self.fout.write(struct.pack("i", -1)) + self.fout.write(struct.pack("i", -1)) def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: sname = name.encode('utf-8') @@ -1103,7 +1125,7 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: @staticmethod def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab, file_type: NEFileType) -> None: - check_vocab_size(params, vocab) + #check_vocab_size(params, vocab) of = OutputFile(fname_out) of.write_file_header(params, file_type) print("Writing vocab...") @@ -1345,7 +1367,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel: return {name: model[name] for name in TENSORS_LIST if name in model} -def load_vocab(path: Path) -> SentencePieceVocab: +def load_vocab(path: Path, params_vocab_size: int) -> SentencePieceVocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. @@ -1363,7 +1385,7 @@ def load_vocab(path: Path) -> SentencePieceVocab: pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") - return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + return SentencePieceVocab(path, params_vocab_size, added_tokens_path if added_tokens_path.exists() else None) def default_outfile(model_paths: List[Path], params: Params) -> Path: @@ -1430,15 +1452,14 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.dump: do_dump_model(model_plus) return + model = model_plus.model + params = Params.load(model_plus) if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir) - - model = model_plus.model - params = Params.load(model_plus) + vocab = load_vocab(vocab_dir, params.n_vocab) model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) model = convert_to_output_type(model, output_type) diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index 7bd6cfca0..71a195fcc 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -151,6 +151,7 @@ class Params: ffn_hidden_size: int rms_norm_eps: float rope_theta: float + rope_scale: float @staticmethod def guessed(model: 'LazyModel') -> 'Params': @@ -179,6 +180,10 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size = config["intermediate_size"] rms_norm_eps = config["rms_norm_eps"] rope_theta = config["rope_theta"] if "rope_theta" in config else 10000 + rope_scale = 1 + if "rope_scaling" in config and config["rope_scaling"] is not None: + rope_scale = config["rope_scaling"]["factor"] if "factor" in config["rope_scaling"] else 1 + return Params( n_vocab=n_vocab, @@ -190,6 +195,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size=ffn_hidden_size, rms_norm_eps=rms_norm_eps, rope_theta=rope_theta, + rope_scale=rope_scale, ) # LLaMA v2 70B params.json @@ -244,7 +250,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> 
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: Dict[str, int] if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens)) + added_tokens = json.load(open(fname_added_tokens, encoding='utf-8')) else: added_tokens = {} vocab_size: int = self.sentencepiece_tokenizer.vocab_size() @@ -1057,6 +1063,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("i", 0)) self.fout.write(struct.pack("f", params.rms_norm_eps)) self.fout.write(struct.pack("f", params.rope_theta)) + self.fout.write(struct.pack("f", params.rope_scale)) self.fout.write( struct.pack("i", 1) diff --git a/neural_speed/convert/convert_mpt.py b/neural_speed/convert/convert_mpt.py index 2bdf0ec4a..c441a9607 100644 --- a/neural_speed/convert/convert_mpt.py +++ b/neural_speed/convert/convert_mpt.py @@ -97,7 +97,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base - + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 76c27c057..4f487f68c 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -108,6 +108,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_quantized_bloom.py b/neural_speed/convert/convert_quantized_bloom.py index c0e88a2d7..823831efa 100644 --- a/neural_speed/convert/convert_quantized_bloom.py +++ b/neural_speed/convert/convert_quantized_bloom.py @@ -170,7 +170,7 @@ def bytes_to_unicode(): f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) f.write(struct.pack("i", 0)) -fout.write(struct.pack("f", 1e-6)) # rms norm eps +f.write(struct.pack("f", 1e-6)) # rms norm eps f.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) f.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py index 6b9bfd79d..5403cd3e3 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -80,6 +80,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("f", config["rms_norm_eps"])) f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) + f.write(struct.pack("f", config["rope_scale"])) # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp diff --git a/neural_speed/convert/convert_quantized_mistral.py 
b/neural_speed/convert/convert_quantized_mistral.py index 3552320f7..e09a9ad02 100644 --- a/neural_speed/convert/convert_quantized_mistral.py +++ b/neural_speed/convert/convert_quantized_mistral.py @@ -83,6 +83,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("f", config["rms_norm_eps"])) f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) + f.write(struct.pack("f", config["rope_scale"])) # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp diff --git a/neural_speed/convert/convert_qwen.py b/neural_speed/convert/convert_qwen.py index c559dd398..a4b9c7f4f 100644 --- a/neural_speed/convert/convert_qwen.py +++ b/neural_speed/convert/convert_qwen.py @@ -114,6 +114,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>'])) diff --git a/neural_speed/convert/convert_starcoder.py b/neural_speed/convert/convert_starcoder.py index 327f88864..932be3f8e 100644 --- a/neural_speed/convert/convert_starcoder.py +++ b/neural_speed/convert/convert_starcoder.py @@ -112,6 +112,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("i", 0)) fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base + fout.write(struct.pack("f", 1.0)) # rope_factor fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index 3ec194a1c..509c047ec 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -2980,7 +2980,7 @@ struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* struct ne_tensor* ne_rope_impl(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, int prompt_size, bool inplace, int n_keep, struct ne_tensor* cossin, int* n_padding, - bool padding_left, float freq_base) { + bool padding_left, float freq_base, float freq_scale) { NE_ASSERT(n_past >= 0 || n_keep >= 0); NE_ASSERT(padding_left); bool is_node = false; @@ -3020,7 +3020,9 @@ struct ne_tensor* ne_rope_impl(struct ne_context* ctx, struct ne_tensor* a, int ne_scratch_load(ctx); - ne_set_op_params(result, &freq_base, sizeof(freq_base)); + float params[] = {freq_base, freq_scale}; + ne_set_op_params(result, ¶ms, sizeof(params)); + result->op = NE_OP_ROPE; result->grad = is_node ? 
ne_dup_tensor(ctx, result) : NULL; result->src0 = a; @@ -3031,18 +3033,20 @@ struct ne_tensor* ne_rope_impl(struct ne_context* ctx, struct ne_tensor* a, int } struct ne_tensor* ne_rope(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, - int prompt_size, float freq_base) { - return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, false, -1, NULL, NULL, true, freq_base); + int prompt_size, float freq_base, float freq_scale) { + return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, false, -1, NULL, NULL, true, freq_base, freq_scale); } struct ne_tensor* ne_rope_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, - int prompt_size, float freq_base) { - return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, true, -1, NULL, NULL, true, freq_base); + int prompt_size, float freq_base, float freq_scale) { + return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, true, -1, NULL, NULL, true, freq_base, freq_scale); } struct ne_tensor* ne_rope_shift_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_shift, int n_dims, int mode, - int prompt_size, int n_keep, struct ne_tensor* cossin, float freq_base) { - return ne_rope_impl(ctx, a, n_shift, n_dims, mode, prompt_size, true, n_keep, cossin, NULL, true, freq_base); + int prompt_size, int n_keep, struct ne_tensor* cossin, float freq_base, + float freq_scale) { + return ne_rope_impl(ctx, a, n_shift, n_dims, mode, prompt_size, true, n_keep, cossin, NULL, true, freq_base, + freq_scale); } // ne_rope_back @@ -3078,13 +3082,16 @@ struct ne_tensor* ne_rope_back(struct ne_context* ctx, struct ne_tensor* a, int } struct ne_tensor* ne_rope_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, - int prompt_size, int* n_padding, float freq_base) { - return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, false, -1, NULL, n_padding, true, freq_base); + int prompt_size, int* n_padding, float freq_base, float freq_scale) { + return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, false, -1, NULL, n_padding, true, freq_base, + freq_scale); } struct ne_tensor* ne_rope_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, - int mode, int prompt_size, int* n_padding, float freq_base) { - return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, true, -1, NULL, n_padding, true, freq_base); + int mode, int prompt_size, int* n_padding, float freq_base, + float freq_scale) { + return ne_rope_impl(ctx, a, n_past, n_dims, mode, prompt_size, true, -1, NULL, n_padding, true, freq_base, + freq_scale); } // ne_alibi @@ -7868,9 +7875,8 @@ static void ne_compute_forward_rope_f32(const struct ne_compute_params* params, NE_ASSERT(src1->type == NE_TYPE_I32); NE_ASSERT(ne_nelements(src1) == 5 + bs); // 5 + bs params - float freq_base = 10000.0f; - memcpy(&freq_base, dst->op_params, sizeof(float)); - static const float freq_scale = 1.0f; + const float freq_base = ((float*)(dst->op_params))[0]; + const float freq_scale = 1 / ((float*)(dst->op_params))[1]; const int64_t n_past = ((int32_t*)src1->data)[ROPE_NPAST_IDX]; const int64_t n_dims = ((int32_t*)src1->data)[ROPE_NDIMS_IDX]; @@ -8044,7 +8050,10 @@ static void ne_compute_forward_rope_f16(const struct ne_compute_params* params, // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f / n_dims); + const float freq_base = ((float*)(dst->op_params))[0]; + const float freq_scale = 1 / 
((float*)(dst->op_params))[1]; + + const float theta_scale = powf(freq_base, -2.0f / n_dims); const bool skip = mode & 1; const bool is_neox = mode & 2; @@ -8054,7 +8063,7 @@ static void ne_compute_forward_rope_f16(const struct ne_compute_params* params, NE_ASSERT(("shift RoPE is only implemented for the vanilla mode", !is_shift || !(is_glm || is_neox || skip))); if (is_shift) { - float theta = n_past; + float theta = n_past * freq_scale; ne_fp16_t* cossin = (dst->opt[0] != NULL) ? dst->opt[0]->data : NULL; if (cossin == NULL) { cossin = malloc(ne0 * sizeof(ne_fp16_t)); @@ -8099,7 +8108,7 @@ static void ne_compute_forward_rope_f16(const struct ne_compute_params* params, if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { @@ -8172,12 +8181,13 @@ static void ne_compute_forward_rope_bestla(const struct ne_compute_params* param const int head_num = dst->ne[2]; const int seq_len = dst->ne[1]; const int head_size = dst->ne[0]; - + const float freq_base = ((float*)(dst->op_params))[0]; + const float freq_scale = 1 / ((float*)(dst->op_params))[1]; if (is_shift) { ne_fp16_t* cossin = (dst->opt[0] != NULL) ? dst->opt[0]->data : NULL; if (cossin == NULL) { - float theta = n_past; - const float theta_scale = powf(10000.0, -2.0f / n_dims); + float theta = n_past * freq_scale; + const float theta_scale = powf(freq_base, -2.0f / n_dims); cossin = malloc(head_size * sizeof(ne_fp16_t)); for (int i0 = 0; i0 < head_size; i0 += 2) { cossin[i0 + 0] = NE_FP32_TO_FP16(cosf(theta)); @@ -10017,7 +10027,7 @@ static void ne_compute_backward(struct ne_context* ctx, struct ne_tensor* tensor const int n_dims = ((int32_t*)src1->data)[1]; const int mode = ((int32_t*)src1->data)[2]; src0->grad = - ne_add_impl(ctx, src0->grad, ne_rope(ctx, tensor->grad, n_past, n_dims, mode, 0, 10000.0), inplace); + ne_add_impl(ctx, src0->grad, ne_rope(ctx, tensor->grad, n_past, n_dims, mode, 0, 10000.0, 1.0), inplace); } if (src1->grad) { // noop diff --git a/neural_speed/core/ne_layers.h b/neural_speed/core/ne_layers.h index 88428b51e..d8bbf02d8 100644 --- a/neural_speed/core/ne_layers.h +++ b/neural_speed/core/ne_layers.h @@ -403,29 +403,30 @@ NE_API struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_t // if mode & 4 == 1, especially for glm // TODO: avoid creating a new tensor every time NE_API struct ne_tensor* ne_rope(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, - int prompt_size, float freq_base); + int prompt_size, float freq_base, float freq_scale); // in-place, returns view(a) NE_API struct ne_tensor* ne_rope_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, - int prompt_size, float freq_base); + int prompt_size, float freq_base, float freq_scale); // shift all tokens by a give p (n_shift) // Optionally give a 1d tensor of precomputed interleaved cos/sin value of n_shift*scale^k for k \in [0, n_dims) NE_API struct ne_tensor* ne_rope_shift_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_shift, int n_dims, int mode, int prompt_size, int n_keep, struct ne_tensor* cossin, - float freq_base); + float freq_base, float freq_scale); // rotary position embedding backward, i.e compute dx from dy // a - dy NE_API struct ne_tensor* ne_rope_back(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode); NE_API struct ne_tensor* ne_rope_with_padding(struct ne_context* ctx, struct ne_tensor* a, int n_past, 
int n_dims, - int mode, int prompt_size, int* n_padding, float freq_base); + int mode, int prompt_size, int* n_padding, float freq_base, + float freq_scale); // in-place, returns view(a) NE_API struct ne_tensor* ne_rope_with_padding_inplace(struct ne_context* ctx, struct ne_tensor* a, int n_past, int n_dims, int mode, int prompt_size, int* n_padding, - float freq_base); + float freq_base, float freq_scale); // alibi position embedding // in-place, returns view(a) diff --git a/neural_speed/models/chatglm/chatglm.cpp b/neural_speed/models/chatglm/chatglm.cpp index f7f927af4..69c973b39 100644 --- a/neural_speed/models/chatglm/chatglm.cpp +++ b/neural_speed/models/chatglm/chatglm.cpp @@ -137,14 +137,14 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i ne_set_name(query_layer, "query_layer"); query_layer = ne_rope_with_padding_inplace(ctx0, query_layer, n_past, rope_dim, 4, first_tokens_size, - n_padding.data(), hparams.freq_base); + n_padding.data(), hparams.freq_base, hparams.freq_scale); query_layer = ne_permute(ctx0, query_layer, 0, 2, 1, 3); // [bs, heads, qlen, head_size] ne_tensor* key_layer = ne_view_4d(ctx0, cur, head_size, num_attention_heads, qlen, batch_size, 3 * head_size * ne_element_size(cur), cur->nb[1], cur->nb[1] * qlen, head_size * ne_element_size(cur)); // [bs, qlen, heads, head_size] key_layer = ne_rope_with_padding_inplace(ctx0, key_layer, n_past, rope_dim, 4, first_tokens_size, - n_padding.data(), hparams.freq_base); + n_padding.data(), hparams.freq_base, hparams.freq_scale); ne_tensor* value_layer = ne_view_4d(ctx0, cur, head_size, num_attention_heads, qlen, batch_size, 3 * head_size * ne_element_size(cur), cur->nb[1], cur->nb[1] * qlen, diff --git a/neural_speed/models/chatglm/chatglm2.cpp b/neural_speed/models/chatglm/chatglm2.cpp index 560452a15..59d1a39e1 100644 --- a/neural_speed/models/chatglm/chatglm2.cpp +++ b/neural_speed/models/chatglm/chatglm2.cpp @@ -146,14 +146,15 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i ne_view_3d(ctx0, cur, head_size, n_head, N, head_size * ne_element_size(cur), cur->nb[1], 0); // [N, heads, head_size] ne_set_name(query_layer, "query_layer"); - query_layer = ne_rope_inplace(ctx0, query_layer, std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base); + query_layer = ne_rope_inplace(ctx0, query_layer, std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base, + hparams.freq_scale); struct ne_tensor* key_layer = ne_view_3d(ctx0, cur, head_size, num_kv_heads, N, head_size * ne_element_size(cur), cur->nb[1], hidden_size * ne_element_size(cur)); // [N, kv_heads, head_size] ne_set_name(key_layer, "key_layer"); key_layer = ne_rope_inplace( // n_ctx exceeds but it will be shift-roped back with cached K - ctx0, key_layer, (is_ring_full ? n_ctx : n_past), n_rot, 0, 0, hparams.freq_base); + ctx0, key_layer, (is_ring_full ? 
n_ctx : n_past), n_rot, 0, 0, hparams.freq_base, hparams.freq_scale); struct ne_tensor* value_layer = ne_view_3d(ctx0, cur, head_size, num_kv_heads, N, head_size * ne_element_size(cur), cur->nb[1], @@ -198,7 +199,8 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N // in a single eval execution if (N == 1) cossin_cache = kv_self.cossin; - key_layer = ne_rope_shift_inplace(ctx0, key_layer, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + key_layer = ne_rope_shift_inplace(ctx0, key_layer, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); key_layer = ne_permute(ctx0, key_layer, 0, 2, 1, 3); // perm back } @@ -253,7 +255,8 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N // in a single eval execution if (N == 1) cossin_cache = kv_self.cossin; - key_layer = ne_rope_shift_inplace(ctx0, key_layer, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + key_layer = ne_rope_shift_inplace(ctx0, key_layer, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); } value_layer = ne_view_3d(ctx0, model.layers[il].v_cache, // tensor diff --git a/neural_speed/models/falcon/falcon.cpp b/neural_speed/models/falcon/falcon.cpp index 6b624e4bb..f45dabdb6 100644 --- a/neural_speed/models/falcon/falcon.cpp +++ b/neural_speed/models/falcon/falcon.cpp @@ -162,8 +162,8 @@ static bool falcon_model_eval_internal(model_context* ctx, const model_input* in fused_qkv_row_nb, (n_embd + n_head_kv * head_dim) * ne_element_size(cur)); // using mode = 2 for neox mode - Qcur = ne_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0, hparams.freq_base); - Kcur = ne_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0, hparams.freq_base); + Qcur = ne_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0, hparams.freq_base, hparams.freq_scale); + Kcur = ne_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0, hparams.freq_base, hparams.freq_scale); // self-attention const float attn_scale = 1.0f / sqrtf(static_cast(head_dim)); diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp index 1dde4ca38..25914885e 100644 --- a/neural_speed/models/gptj/gptj.cpp +++ b/neural_speed/models/gptj/gptj.cpp @@ -186,9 +186,10 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu Kcur = ne_reshape_4d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_size, n_head, N, batch_size); Vcur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); } - Qcur = ne_rope_inplace(ctx0, Qcur, std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base); + Qcur = + ne_rope_inplace(ctx0, Qcur, std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base, hparams.freq_scale); Kcur = ne_rope_inplace( // n_ctx exceeds but it will be shift-roped back with cached K - ctx0, Kcur, (is_ring_full ? n_ctx : n_past), n_rot, 0, 0, hparams.freq_base); + ctx0, Kcur, (is_ring_full ? 
n_ctx : n_past), n_rot, 0, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Qcur, "Qcur"); ne_set_name(Kcur, "Kcur"); ne_set_name(Vcur, "Vcur"); @@ -292,8 +293,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N // in a single eval execution - if (N == 1) cossin_cache = kv_self.cossin; - K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + if (N == 1) { + cossin_cache = kv_self.cossin; + } + K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); } const auto v_size = kv_cache_info.v_bytes; V = ne_view_4d(ctx0, kv_self.v, // tensor @@ -320,8 +324,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution - if (N == 1) cossin_cache = kv_self.cossin; - K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + if (N == 1) { + cossin_cache = kv_self.cossin; + } + K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); K = ne_permute(ctx0, K, 0, 2, 1, 3); } } else { @@ -331,8 +338,11 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution - if (N == 1) cossin_cache = kv_self.cossin; - K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + if (N == 1) { + cossin_cache = kv_self.cossin; + } + K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); K = ne_permute(ctx0, K, 0, 2, 1, 3); } diff --git a/neural_speed/models/gptneox/gptneox.cpp b/neural_speed/models/gptneox/gptneox.cpp index 80ec83d7a..4652e4664 100644 --- a/neural_speed/models/gptneox/gptneox.cpp +++ b/neural_speed/models/gptneox/gptneox.cpp @@ -188,9 +188,9 @@ static bool gptneox_model_eval_internal(model_context* ctx, const model_input* i // using mode = 2 for GPT-NeoX mode Qcur = ne_rope_inplace(ctx0, ne_reshape_4d(ctx0, Qcur, head_dim, n_head, N, batch_size), n_past, n_rot, 2, 0, - hparams.freq_base); + hparams.freq_base, hparams.freq_scale); Kcur = ne_rope_inplace(ctx0, ne_reshape_4d(ctx0, Kcur, head_dim, n_head, N, batch_size), n_past, n_rot, 2, 0, - hparams.freq_base); + hparams.freq_base, hparams.freq_scale); const float attn_scale = 1.0f / sqrtf(static_cast(head_dim)); // store key and value to memory if (!run_mha_reordered) { diff --git a/neural_speed/models/llama/llama.cpp b/neural_speed/models/llama/llama.cpp index 39cda575b..65673671f 100644 --- a/neural_speed/models/llama/llama.cpp +++ b/neural_speed/models/llama/llama.cpp @@ -187,10 +187,11 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp Kcur = ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), head_size, n_head_kv, N); Vcur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur); } - Qcur = ne_rope_inplace(ctx0, Qcur, std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base); + Qcur = + ne_rope_inplace(ctx0, Qcur, 
std::max(n_cached - N, n_past), n_rot, 0, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Qcur, "Qcur"); Kcur = ne_rope_inplace( // n_ctx exceeds but it will be shift-roped back with cached K - ctx0, Kcur, (is_ring_full ? n_ctx : n_past), n_rot, 0, 0, hparams.freq_base); + ctx0, Kcur, (is_ring_full ? n_ctx : n_past), n_rot, 0, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Kcur, "Kcur"); Vcur = ne_transpose(ctx0, ne_reshape_2d(ctx0, Vcur, head_size * n_head_kv, N)); ne_set_name(Vcur, "Vcur"); @@ -220,8 +221,11 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp struct ne_tensor* cossin_cache = nullptr; // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution - if (N == 1) cossin_cache = kv_self.cossin; - K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + if (N == 1) { + cossin_cache = kv_self.cossin; + } + K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); } K = ne_permute(ctx0, K, 0, 2, 1, 3); ne_set_name(K, "K"); @@ -301,7 +305,8 @@ static bool llama_model_eval_internal(model_context* ctx, const model_input* inp // Currently we only cache cossin for N == 1 in model-wide; It may be worthwhile to cache cossin for other N in // a single eval execution if (N == 1) cossin_cache = kv_self.cossin; - K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base); + K = ne_rope_shift_inplace(ctx0, K, -N, n_rot, 0, 0, n_keep, cossin_cache, hparams.freq_base, + hparams.freq_scale); } ne_set_name(K, "K"); diff --git a/neural_speed/models/mistral/CMakeLists.txt b/neural_speed/models/mistral/CMakeLists.txt deleted file mode 100644 index dd894d1a2..000000000 --- a/neural_speed/models/mistral/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set(TARGET mistral) -add_library_w_warning(${TARGET} mistral.cpp mistral_utils.cpp ${MODEL_UTILS_SOURCE}) -target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump -set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_link_libraries(${TARGET} PUBLIC ne_layers bestla::bestla) diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h index 30366104e..10b0c24cb 100644 --- a/neural_speed/models/model_utils/model_files.h +++ b/neural_speed/models/model_utils/model_files.h @@ -1043,6 +1043,7 @@ struct model_file_loader { file.read_raw(&hparams.rms_norm_eps, sizeof(float)); file.read_raw(&hparams.freq_base, sizeof(float)); + file.read_raw(&hparams.freq_scale, sizeof(float)); } void read_ne_vocab() { @@ -1162,6 +1163,7 @@ struct model_file_saver { file.write_raw(&hparams.rms_norm_eps, sizeof(float)); file.write_raw(&hparams.freq_base, sizeof(float)); + file.write_raw(&hparams.freq_scale, sizeof(float)); } void write_vocab() { if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) { diff --git a/neural_speed/models/model_utils/model_types.h b/neural_speed/models/model_utils/model_types.h index 3c05b168b..3ec457562 100644 --- a/neural_speed/models/model_utils/model_types.h +++ b/neural_speed/models/model_utils/model_types.h @@ -126,7 +126,8 @@ struct model_hparams { uint32_t word_embed_proj_dim = 0; // for opt bool do_layer_norm_before = false; // for opt float rms_norm_eps = 1e-6f; // rms norm epsilon - float freq_base = 10000.0f; + float freq_base = 10000.0f; // rope theta + float freq_scale = 1.0f; // rope scale factor // ChatGLM-2 int32_t multi_query_group_num = 0; diff --git a/neural_speed/models/model_utils/model_utils.cpp b/neural_speed/models/model_utils/model_utils.cpp index 63d84907f..08a83fc29 100644 --- a/neural_speed/models/model_utils/model_utils.cpp +++ b/neural_speed/models/model_utils/model_utils.cpp @@ -148,11 +148,12 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c const auto cossin_dtype = wtype == NE_TYPE_BTLA ? NE_TYPE_F16 : wtype; cache.cossin = ne_new_tensor_1d(cache.ctx, cossin_dtype, head_size, NE_SIZE_CALC); ne_set_name(cache.cossin, "cossin(-1)"); - float theta = -1; + float freq_base = hparams.freq_base; + float theta = -1 * hparams.freq_scale; float theta_scale = (model != nullptr && model->arch == MODEL_CHATGLM2) - ? std::pow(10000.f, -2.0f / (head_size / 2)) // chatglm2 has their DIM_SCALE of 2 - : hparams.n_rot > 0 ? std::pow(10000.f, -2.0f / hparams.n_rot) - : std::pow(10000.f, -2.0f / head_size); + ? std::pow(freq_base, -2.0f / (head_size / 2)) // chatglm2 has their DIM_SCALE of 2 + : hparams.n_rot > 0 ? 
std::pow(freq_base, -2.0f / hparams.n_rot) + : std::pow(freq_base, -2.0f / head_size); if (cossin_dtype == NE_TYPE_F16) { const auto data = reinterpret_cast(cache.cossin->data); for (int i = 0; i < head_size; i += 2) { diff --git a/neural_speed/models/qwen/qwen.cpp b/neural_speed/models/qwen/qwen.cpp index 28ce4bcd6..837c09021 100644 --- a/neural_speed/models/qwen/qwen.cpp +++ b/neural_speed/models/qwen/qwen.cpp @@ -180,9 +180,9 @@ static bool qwen_model_eval_internal(model_context* ctx, const model_input* inpu fused_qkv_row_nb, 2 * sizeof(float) * n_embd)); // using mode = 2 for GPT-NeoX mode - Qcur = ne_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0, hparams.freq_base); + Qcur = ne_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Qcur, "Qcur"); - Kcur = ne_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0, hparams.freq_base); + Kcur = ne_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0, hparams.freq_base, hparams.freq_scale); ne_set_name(Kcur, "kcur"); const float attn_scale = 1.0f / sqrtf(static_cast(head_dim)); // store key and value to memory diff --git a/tests/model-test/cpp_graph_inference.sh b/tests/model-test/cpp_graph_inference.sh index 42d835e23..e5168ddda 100644 --- a/tests/model-test/cpp_graph_inference.sh +++ b/tests/model-test/cpp_graph_inference.sh @@ -247,7 +247,7 @@ function main() { infer_cmd="./build/bin/run_qwen" elif [[ "${model}" == "magicoder" ]]; then quant_script="./build/bin/quant_llama" - convert_script="${convert_script}/convert_bmagicoder.py" + convert_script="${convert_script}/convert_llama.py" infer_cmd="./build/bin/run_llama" elif [[ "${model}" == "whisper" ]]; then quant_script="./build/bin/quant_whisper"
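The `neural_speed/__init__.py` change above picks the thread-count default per platform, since `os.sched_getaffinity` is not available on Windows. A minimal sketch of that fallback logic follows; the helper name `default_threads` is illustrative only and not part of the package API:

```python
import os
import platform

def default_threads() -> int:
    # OMP_NUM_THREADS, when set, always wins (same as the patched code path).
    env_threads = os.getenv("OMP_NUM_THREADS")
    if env_threads is not None:
        return int(env_threads)
    if "windows" in platform.platform().lower():
        # os.sched_getaffinity is POSIX-only; fall back to the logical CPU count.
        return os.cpu_count() or 1
    return len(os.sched_getaffinity(0))
```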
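The converter scripts now emit one extra `rope_factor` float immediately after `freq_base` in the model header: converters for models without RoPE scaling write a neutral `1.0`, while `convert_llama.py` and `convert_mistral.py` read the factor from the Hugging Face `rope_scaling` config entry when present. A hedged sketch of that pattern, where the `write_rope_params` helper and the `config` dict are assumptions for illustration rather than Neural Speed APIs:

```python
import struct

def write_rope_params(fout, config: dict) -> None:
    # freq_base (rope theta) comes from the HF config when present.
    fout.write(struct.pack("f", config.get("rope_theta", 10000.0)))
    # rope_factor: linear RoPE scaling factor; 1.0 when the model has no rope_scaling.
    rope_scale = 1.0
    if config.get("rope_scaling") is not None:
        rope_scale = config["rope_scaling"].get("factor", 1.0)
    fout.write(struct.pack("f", rope_scale))
```

On the loading side, `model_files.h` reads the value back into `hparams.freq_scale`, which defaults to `1.0f` in `model_hparams`.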
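`SentencePieceVocab` in `convert_llama.py` is also padded up to the vocab size declared in the model config, so checkpoints whose `tokenizer.model` carries fewer entries than `config.json` (the reason `check_vocab_size` is commented out) still produce a header-consistent vocab. A simplified sketch, assuming a `SentencePieceProcessor`-style tokenizer and omitting the unknown/control/byte-piece handling of the real loop:

```python
def padded_tokens(tokenizer, params_vocab_size):
    # Yield (text, score) pairs for every token id the model config expects.
    for i in range(params_vocab_size):
        if i < tokenizer.vocab_size():
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
            yield text, tokenizer.get_score(i)
        else:
            # Ids beyond the tokenizer are emitted as placeholder entries,
            # mirroring the `else` branch added to sentencepiece_tokens().
            yield " \u2047 ".encode("utf-8"), float(i)
```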
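On the runtime side, `ne_rope_impl` now packs `{freq_base, freq_scale}` into the op params, and the forward kernels divide each position by the stored scale factor before computing rotation angles, which is what implements linear RoPE scaling. A rough Python sketch of the angle computation (not the actual C kernel):

```python
def rope_angles(op_params, n_dims, p):
    # op_params[0] is freq_base (theta), op_params[1] is the rope scale factor.
    freq_base = op_params[0]
    freq_scale = 1.0 / op_params[1]       # the kernel inverts the stored factor
    theta = freq_scale * float(p)         # scaled starting angle for position p
    theta_scale = freq_base ** (-2.0 / n_dims)
    angles = []
    for _ in range(0, n_dims, 2):         # one angle per (even, odd) dim pair
        angles.append(theta)
        theta *= theta_scale
    return angles
```

With a checkpoint trained for 4x linear scaling (factor 4.0), position 4096 is rotated with the same angles the base model would use at position 1024.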