chore: function calling cleanup #2195

Open · wants to merge 3 commits into base: dev
30 changes: 24 additions & 6 deletions docs/docs/guides/function-calling.md
@@ -63,8 +63,14 @@ tools = [

 completion_payload = {
     "messages": [
-        {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."},
-        {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"},
+        {
+            "role": "system",
+            "content": 'You have access to the following CUSTOM functions:\n\n<CUSTOM_FUNCTIONS>\n\nIf you choose to call a function, ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => `<function`\nparameters => a JSON dict with the function argument name as key and function argument value as value.\nend_tag => `</function>`\n\nHere is an example:\n<function=example_function_name>{"example_name": "example_value"}</function>\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- You can call one or more functions at a time, but remember to only choose the correct function\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\n- If you cannot find the correct parameters or arguments for a function in the user\'s message, ask the user to provide them; do not make assumptions.\n- No explanation is needed when calling a function.\n\nYou are a helpful assistant.',
+        },
+        {
+            "role": "user",
+            "content": "Hi, can you tell me the delivery date for my order?"
+        },
     ]
 }

@@ -126,10 +132,22 @@ Once the user provides their order ID:
 ```python
 completion_payload = {
     "messages": [
-        {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."},
-        {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"},
-        {"role": "assistant", "content": "Of course! Please provide your order ID so I can look it up."},
-        {"role": "user", "content": "i think it is order_70705"},
+        {
+            "role": "system",
+            "content": 'You have access to the following CUSTOM functions:\n\n<CUSTOM_FUNCTIONS>\n\nIf you choose to call a function, ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => `<function`\nparameters => a JSON dict with the function argument name as key and function argument value as value.\nend_tag => `</function>`\n\nHere is an example:\n<function=example_function_name>{"example_name": "example_value"}</function>\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- You can call one or more functions at a time, but remember to only choose the correct function\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\n- If you cannot find the correct parameters or arguments for a function in the user\'s message, ask the user to provide them; do not make assumptions.\n- No explanation is needed when calling a function.\n\nYou are a helpful assistant.',
+        },
+        {
+            "role": "user",
+            "content": "Hi, can you tell me the delivery date for my order?"
+        },
+        {
+            "role": "assistant",
+            "content": "Of course! Please provide your order ID so I can look it up."
+        },
+        {
+            "role": "user",
+            "content": "i think it is order_70705"
+        },
     ]
 }

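The new system prompt asks the model to emit calls as `<function=name>{...}</function>` on a single line. As a point of reference, here is a minimal sketch of how a client might parse such replies (the helper name and regex are illustrative, not part of this PR):

```python
import json
import re

# Matches one-line calls of the form:
#   <function=get_delivery_date>{"order_id": "order_70705"}</function>
FUNCTION_CALL_RE = re.compile(r"<function=(\w+)>(\{.*?\})</function>")

def parse_function_calls(reply: str) -> list[tuple[str, dict]]:
    """Extract (function_name, arguments) pairs from a model reply."""
    calls = []
    for name, raw_args in FUNCTION_CALL_RE.findall(reply):
        try:
            calls.append((name, json.loads(raw_args)))
        except json.JSONDecodeError:
            continue  # malformed arguments; skip this call
    return calls
```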
1 change: 0 additions & 1 deletion engine/controllers/server.cc
@@ -179,7 +179,6 @@ void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
 void server::ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
                                  SyncQueue& q) {
   auto [status, res] = q.wait_and_pop();
-  function_calling_utils::PostProcessResponse(res);
   LOG_DEBUG << "response: " << res.toStyledString();
   auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
   resp->setStatusCode(
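With this post-processing step removed, chat completion responses are forwarded exactly as the engine produced them. Assuming the underlying llama.cpp server (run with `--jinja`, added in the next file) emits OpenAI-style `tool_calls`, a client would see roughly this shape (all values illustrative, not a payload guaranteed by this PR):

```python
# Rough sketch of an OpenAI-compatible tool-call response.
example_response = {
    "choices": [
        {
            "finish_reason": "tool_calls",
            "message": {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {
                        "id": "call_0",
                        "type": "function",
                        "function": {
                            "name": "get_delivery_date",
                            "arguments": '{"order_id": "order_70705"}',
                        },
                    }
                ],
            },
        }
    ]
}
```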
1 change: 1 addition & 0 deletions engine/extensions/local-engine/local_engine.cc
@@ -544,6 +544,7 @@ void LocalEngine::LoadModel(std::shared_ptr<Json::Value> json_body,

params.push_back("--pooling");
params.push_back("mean");
params.push_back("--jinja");

std::vector<std::string> v;
v.reserve(params.size() + 1);
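Passing `--jinja` to the bundled llama.cpp server enables server-side chat-template rendering, including tool handling, which is what lets the engine-side rendering in `inference_service.cc` below be deleted. A rough sketch of a request that relies on this, assuming an OpenAI-compatible endpoint (host, port, and model name are placeholders):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:39281/v1/chat/completions",  # placeholder address
    json={
        "model": "my-local-model",  # placeholder model name
        "messages": [
            {"role": "user", "content": "When will order_70705 arrive?"}
        ],
        # With --jinja, the llama.cpp server applies the model's chat
        # template (tools included) itself; the engine no longer has to
        # pre-render a "prompt" string.
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_delivery_date",
                "description": "Get the delivery date for an order.",
                "parameters": {
                    "type": "object",
                    "properties": {"order_id": {"type": "string"}},
                    "required": ["order_id"],
                },
            },
        }],
    },
)
print(resp.json())
```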
47 changes: 0 additions & 47 deletions engine/services/inference_service.cc
@@ -13,8 +13,6 @@ cpp::result<void, InferResult> InferenceService::HandleChatCompletion(
     engine_type = (*(json_body)).get("engine", kLlamaRepo).asString();
   }
   CTL_DBG("engine_type: " << engine_type);
-  function_calling_utils::PreprocessRequest(json_body);
-  CTL_DBG("engine_type: " << engine_type);
   auto tool_choice = json_body->get("tool_choice", Json::Value::null);
   auto model_id = json_body->get("model", "").asString();
   if (saved_models_.find(model_id) != saved_models_.end()) {
@@ -46,51 +44,6 @@ cpp::result<void, InferResult> InferenceService::HandleChatCompletion(
     return cpp::fail(std::make_pair(stt, res));
   }
 
-  if (!model_id.empty()) {
-    if (auto model_service = model_service_.lock()) {
-      auto metadata_ptr = model_service->GetCachedModelMetadata(model_id);
-      if (metadata_ptr != nullptr &&
-          !metadata_ptr->tokenizer->chat_template.empty()) {
-        auto tokenizer = metadata_ptr->tokenizer;
-        auto messages = (*json_body)["messages"];
-        Json::Value messages_jsoncpp(Json::arrayValue);
-        for (auto message : messages) {
-          messages_jsoncpp.append(message);
-        }
-
-        Json::Value tools(Json::arrayValue);
-        Json::Value template_data_json;
-        template_data_json["messages"] = messages_jsoncpp;
-        // template_data_json["tools"] = tools;
-
-        auto prompt_result = jinja::RenderTemplate(
-            tokenizer->chat_template, template_data_json, tokenizer->bos_token,
-            tokenizer->eos_token, tokenizer->add_bos_token,
-            tokenizer->add_eos_token, tokenizer->add_generation_prompt);
-        if (prompt_result.has_value()) {
-          (*json_body)["prompt"] = prompt_result.value();
-          if (json_body->isMember("stop")) {
-            bool need_append = true;
-            for (auto& s : (*json_body)["stop"]) {
-              if (s.asString() == tokenizer->eos_token) {
-                need_append = false;
-              }
-            }
-            if (need_append) {
-              (*json_body)["stop"].append(tokenizer->eos_token);
-            }
-          } else {
-            Json::Value stops(Json::arrayValue);
-            stops.append(tokenizer->eos_token);
-            (*json_body)["stop"] = stops;
-          }
-        } else {
-          CTL_ERR("Failed to render prompt: " + prompt_result.error());
-        }
-      }
-    }
-  }
-
   CTL_DBG("Json body inference: " + json_body->toStyledString());
 
   auto cb = [q, tool_choice](Json::Value status, Json::Value res) {
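For reference, the deleted block rendered the model's chat template over the request messages and ensured the EOS token appeared in the stop list. A Python sketch of the equivalent behavior now delegated to llama.cpp (field names mirror the C++ tokenizer metadata; this is illustrative, not the engine's API):

```python
from jinja2 import Template

def render_prompt(chat_template: str, messages: list[dict],
                  bos_token: str, eos_token: str,
                  add_generation_prompt: bool = True) -> str:
    """Roughly what the removed C++ path did: render the chat template
    over the request messages into a raw prompt string."""
    return Template(chat_template).render(
        messages=messages,
        bos_token=bos_token,
        eos_token=eos_token,
        add_generation_prompt=add_generation_prompt,
    )

def ensure_stop_token(body: dict, eos_token: str) -> None:
    """The removed code also appended eos_token to "stop" when absent."""
    stops = body.setdefault("stop", [])
    if eos_token not in stops:
        stops.append(eos_token)
```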
26 changes: 1 addition & 25 deletions engine/services/model_service.cc
@@ -691,21 +691,7 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
   auto status = std::get<0>(ir)["status_code"].asInt();
   auto data = std::get<1>(ir);
 
-  if (status == drogon::k200OK) {
-    // start model successfully, in case not vision model, we store the metadata so we can use
-    // for each inference
-    if (!json_data.isMember("mmproj") || json_data["mmproj"].isNull()) {
-      auto metadata_res = GetModelMetadata(model_handle);
-      if (metadata_res.has_value()) {
-        loaded_model_metadata_map_.emplace(model_handle,
-                                           std::move(metadata_res.value()));
-        CTL_INF("Successfully stored metadata for model " << model_handle);
-      } else {
-        CTL_WRN("Failed to get metadata for model " << model_handle << ": "
-                                                    << metadata_res.error());
-      }
-    }
-
+  if (status == drogon::k200OK) {
     return StartModelResult{/* .success = */ true,
                             /* .warning = */ may_fallback_res.value()};
   } else if (status == drogon::k409Conflict) {
@@ -760,8 +746,6 @@ cpp::result<bool, std::string> ModelService::StopModel(
     if (bypass_check) {
       bypass_stop_check_set_.erase(model_handle);
     }
-    loaded_model_metadata_map_.erase(model_handle);
-    CTL_INF("Removed metadata for model " << model_handle);
     return true;
   } else {
     CTL_ERR("Model failed to stop with status code: " << status);
@@ -1090,14 +1074,6 @@ ModelService::GetModelMetadata(const std::string& model_id) const {
   return std::move(*model_metadata_res);
 }
 
-std::shared_ptr<ModelMetadata> ModelService::GetCachedModelMetadata(
-    const std::string& model_id) const {
-  if (loaded_model_metadata_map_.find(model_id) ==
-      loaded_model_metadata_map_.end())
-    return nullptr;
-  return loaded_model_metadata_map_.at(model_id);
-}
-
 std::string ModelService::GetEngineByModelId(
     const std::string& model_id) const {
   namespace fs = std::filesystem;
9 changes: 0 additions & 9 deletions engine/services/model_service.h
@@ -83,9 +83,6 @@ class ModelService {
   cpp::result<std::shared_ptr<ModelMetadata>, std::string> GetModelMetadata(
       const std::string& model_id) const;
 
-  std::shared_ptr<ModelMetadata> GetCachedModelMetadata(
-      const std::string& model_id) const;
-
   std::string GetEngineByModelId(const std::string& model_id) const;
 
  private:
@@ -104,12 +101,6 @@
   std::unordered_set<std::string> bypass_stop_check_set_;
   std::shared_ptr<EngineServiceI> engine_svc_ = nullptr;
 
-  /**
-   * Store the chat template of loaded model.
-   */
-  std::unordered_map<std::string, std::shared_ptr<ModelMetadata>>
-      loaded_model_metadata_map_;
-
   std::mutex es_mtx_;
   std::unordered_map<std::string, std::optional<hardware::Estimation>> es_;
   cortex::TaskQueue& task_queue_;
157 changes: 0 additions & 157 deletions engine/test/components/test_function_calling.cc

This file was deleted.
