QwenLM · fann1993814 · Nov 15, 2023 · Nov 15, 2023
diff --git a/main.cpp b/main.cpp
@@ -112,7 +112,7 @@ static auto get_utf8_line(std::string &line) -> bool {
 static auto chat(Args &args) -> void {
   ggml_time_init();
   int64_t start_load_us = ggml_time_us();
-  qwen::Pipeline pipeline(args.model_path, args.tiktoken_path);
+  qwen::Pipeline pipeline(args.model_path, args.tiktoken_path, args.max_length);
   int64_t end_load_us = ggml_time_us();
 
   std::string model_name = "qwen";

diff --git a/qwen.cpp b/qwen.cpp
@@ -505,11 +505,12 @@ auto get_default_num_threads() -> int {
 
 QwenForCausalLM::QwenForCausalLM(const QwenConfig &config)
   : config(config) {
-  ctx_.compute_buffer.resize(MEM_SIZE);
-  ctx_.scratch_buffer.resize(SCRATCH_SIZE);
+  const float scale = config.max_length / 2048.0; // MEM_SIZE and SCRATCH_SIZE are sized for a 2k context, so scale them in proportion to max_length
+  ctx_.compute_buffer.resize(static_cast<size_t>(MEM_SIZE * scale));
+  ctx_.scratch_buffer.resize(static_cast<size_t>(SCRATCH_SIZE * scale));
   ctx_.scratch = {0, ctx_.scratch_buffer.size(), ctx_.scratch_buffer.data()};
 #ifdef GGML_USE_CUBLAS
-  ggml_cuda_set_scratch_size(SCRATCH_SIZE);
+  ggml_cuda_set_scratch_size(static_cast<size_t>(SCRATCH_SIZE * scale));
 #endif
   constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
   const size_t ctx_w_size = (3 + config.num_hidden_layers * 8) * tensor_ovhd;
@@ -803,7 +804,7 @@ auto QwenForCausalLM::forward(
 
 // ===== pipeline =====
 
-Pipeline::Pipeline(const std::string &path, const std::string &tiktoken_path) {
+Pipeline::Pipeline(const std::string &path, const std::string &tiktoken_path, const int max_length) {
   mapped_file = std::make_unique<MappedFile>(path);
   ModelLoader loader(std::string_view((char *)mapped_file->data, mapped_file->size));
 
@@ -813,6 +814,11 @@ Pipeline::Pipeline(const std::string &path, const std::string &tiktoken_path) {
 
   // load config
   QwenConfig config = loader.read_basic<QwenConfig>();
+
+  // override the model's max length with the caller-supplied value, but only if it is positive and smaller than the model's own limit
+  if (max_length > 0 && max_length < config.max_length) {
+    config.max_length = max_length;
+  }
 
   // load model
   model = std::make_unique<QwenForCausalLM>(config);

diff --git a/qwen.h b/qwen.h
@@ -418,7 +418,7 @@ class QwenForCausalLM {
 
 class Pipeline {
   public:
-    Pipeline(const std::string &path, const std::string &tiktoken_path);
+    Pipeline(const std::string &path, const std::string &tiktoken_path, const int max_length = 0);
 
     auto generate(const std::vector<int> &input_ids, const GenerationConfig &gen_config,
                   BaseStreamer *streamer = nullptr) const -> std::vector<int>;

diff --git a/qwen_cpp/__init__.py b/qwen_cpp/__init__.py
@@ -7,10 +7,10 @@
 
 class Pipeline(_C.Pipeline):
     def __init__(
-        self, model_path: str, tiktoken_path: str, *, dtype: Optional[str] = None
+        self, model_path: str, tiktoken_path: str, *, max_length: int = 2048, dtype: Optional[str] = None
     ) -> None:
         if Path(model_path).is_file() and Path(tiktoken_path).is_file():
-            super().__init__(str(model_path), str(tiktoken_path))
+            super().__init__(str(model_path), str(tiktoken_path), int(max_length))
         else:
             from qwen_cpp.convert import convert
 

diff --git a/qwen_pybind.cpp b/qwen_pybind.cpp
@@ -55,7 +55,7 @@ PYBIND11_MODULE(_C, m) {
     .def_readwrite("num_threads", &GenerationConfig::num_threads);
 
   py::class_<Pipeline>(m, "Pipeline")
-    .def(py::init<const std::string &, const std::string &>())
+    .def(py::init<const std::string &, const std::string &, const int>())
     .def_property_readonly("model", [](const Pipeline &self) { return self.model.get(); })
     .def_property_readonly("tokenizer", [](const Pipeline &self) { return self.tokenizer.get(); });
 }