Add timeout param for DocSum and FaqGen to deal with long context #1329

Open · wants to merge 4 commits into base: main
1 change: 1 addition & 0 deletions comps/cores/proto/api_protocol.py
@@ -195,6 +195,7 @@ class ChatCompletionRequest(BaseModel):
# top_p: Optional[float] = None # Priority use openai
typical_p: Optional[float] = None
# repetition_penalty: Optional[float] = None
+ timeout: Optional[int] = None

# doc: begin-chat-completion-extra-params
echo: Optional[bool] = Field(
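
For context, a minimal sketch (not part of the diff) of how the new field behaves on a trimmed-down request model; the real `ChatCompletionRequest` defines many more fields:

```python
from typing import Optional

from pydantic import BaseModel


class ChatCompletionRequest(BaseModel):
    # Trimmed to the fields relevant here; the real model has many more.
    messages: str  # simplified; the actual field accepts richer message types
    max_tokens: Optional[int] = None
    timeout: Optional[int] = None  # new: per-request timeout, in seconds


# Omitting "timeout" still validates; it defaults to None, letting each
# backend integration pick its own default (e.g. 120s for TGI).
req = ChatCompletionRequest(messages="hello", timeout=200)
print(req.timeout)  # 200
```
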
6 changes: 4 additions & 2 deletions comps/llms/src/doc-summarization/README.md
@@ -133,6 +133,8 @@ curl http://${your_ip}:9000/v1/docsum \

"summary_type" is set to be "auto" by default, in this mode we will check input token length, if it exceed `MAX_INPUT_TOKENS`, `summary_type` will automatically be set to `refine` mode, otherwise will be set to `stuff` mode.

+ With long contexts, a request may be canceled because generation takes longer than the default `timeout` value (120s for TGI). Increase the timeout as needed.

**summary_type=stuff**

In this mode the LLM generates the summary from the complete input text. Please set `MAX_INPUT_TOKENS` and `MAX_TOTAL_TOKENS` carefully according to your model and device memory; otherwise long inputs may exceed the LLM's context limit and raise an error.
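
The collapsed part of the diff contains the README's `stuff`-mode request; a sketch in the same style with the new `timeout` field added (values are illustrative):

```bash
curl http://${your_ip}:9000/v1/docsum \
  -X POST \
  -d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models.", "max_tokens":32, "language":"en", "summary_type": "stuff", "stream":false, "timeout":200}' \
  -H 'Content-Type: application/json'
```
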
@@ -157,7 +159,7 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.ma
```bash
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}' \
-H 'Content-Type: application/json'
```

@@ -170,6 +172,6 @@ In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - 2 * inpu
```bash
curl http://${your_ip}:9000/v1/docsum \
-X POST \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}' \
-d '{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}' \
-H 'Content-Type: application/json'
```
1 change: 1 addition & 0 deletions comps/llms/src/doc-summarization/integrations/tgi.py
@@ -70,6 +70,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
temperature=input.temperature if input.temperature else 0.01,
repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
streaming=input.stream,
+ timeout=input.timeout if input.timeout is not None else 120,
server_kwargs=server_kwargs,
task="text-generation",
)
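
Note the explicit `is not None` test: writing `input.timeout or 120` instead would also discard an explicit `timeout=0`. A minimal sketch of the pattern (the helper below is hypothetical, not part of the diff):

```python
from typing import Optional


def resolve_timeout(requested: Optional[int], default: int = 120) -> int:
    # "requested or default" would wrongly replace 0; test for None instead.
    return requested if requested is not None else default


assert resolve_timeout(None) == 120  # no client value -> TGI default
assert resolve_timeout(200) == 200   # client override wins
assert resolve_timeout(0) == 0       # explicit 0 is preserved
```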
1 change: 1 addition & 0 deletions comps/llms/src/doc-summarization/integrations/vllm.py
@@ -63,6 +63,7 @@ async def invoke(self, input: DocSumChatCompletionRequest):
top_p=input.top_p if input.top_p else 0.95,
streaming=input.stream,
temperature=input.temperature if input.temperature else 0.01,
+ request_timeout=float(input.timeout) if input.timeout is not None else None,
)
result = await self.generate(input, self.client)

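On the vLLM path the value is forwarded as `request_timeout`, which the underlying OpenAI-compatible client takes as a float (presumably the reason for the `float(...)` cast). When calling the microservice, it can help to pair the server-side timeout with a slightly larger client-side one; a sketch using `requests` (host and port assumed from the README examples):

```python
import requests

payload = {
    "messages": "a long document ...",
    "summary_type": "refine",
    "timeout": 200,  # server-side generation timeout, in seconds
}

# Keep the client-side timeout above the server-side one so the server can
# time out first and return a proper error instead of a dropped connection.
resp = requests.post(
    "http://localhost:9000/v1/docsum",  # assumed endpoint from the README
    json=payload,
    timeout=210,
)
print(resp.status_code, resp.text[:200])
```
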
1 change: 1 addition & 0 deletions comps/llms/src/faq-generation/integrations/tgi.py
@@ -67,6 +67,7 @@ async def invoke(self, input: ChatCompletionRequest):
temperature=input.temperature if input.temperature else 0.01,
repetition_penalty=input.repetition_penalty if input.repetition_penalty else 1.03,
streaming=input.stream,
+ timeout=input.timeout if input.timeout is not None else 120,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
1 change: 1 addition & 0 deletions comps/llms/src/faq-generation/integrations/vllm.py
@@ -60,6 +60,7 @@ async def invoke(self, input: ChatCompletionRequest):
top_p=input.top_p if input.top_p else 0.95,
streaming=input.stream,
temperature=input.temperature if input.temperature else 0.01,
+ request_timeout=float(input.timeout) if input.timeout is not None else None,
)
result = await self.generate(input, self.client)

4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_tgi.sh
@@ -125,15 +125,15 @@ function validate_microservices() {
'text' \
"docsum-tgi" \
"docsum-tgi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-tgi" \
"docsum-tgi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}
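
The `validate_services` helper is defined elsewhere in the test harness; a hypothetical minimal stand-in, to show what these calls exercise (the real implementation differs in detail):

```bash
validate_services() {
    local url=$1 expected=$2 service=$3 container=$4 payload=$5
    local response
    response=$(curl -s -X POST "$url" \
        -H 'Content-Type: application/json' \
        -d "$payload")
    if echo "$response" | grep -q "$expected"; then
        echo "[$service] PASS"
    else
        echo "[$service] FAIL: $response"
        docker logs "$container" | tail -n 20
        exit 1
    fi
}
```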

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_tgi_on_intel_hpu.sh
@@ -126,15 +126,15 @@ function validate_microservices() {
'text' \
"docsum-tgi-gaudi" \
"docsum-tgi-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-tgi-gaudi" \
"docsum-tgi-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_vllm.sh
@@ -140,15 +140,15 @@ function validate_microservices() {
'text' \
"docsum-vllm" \
"docsum-vllm" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-vllm" \
"docsum-vllm" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {
4 changes: 2 additions & 2 deletions tests/llms/test_llms_doc-summarization_vllm_on_intel_hpu.sh
@@ -139,15 +139,15 @@ function validate_microservices() {
'text' \
"docsum-vllm-gaudi" \
"docsum-vllm-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false, "timeout":200}'

echo "Validate refine mode..."
validate_services \
"$URL" \
'text' \
"docsum-vllm-gaudi" \
"docsum-vllm-gaudi" \
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000}'
'{"messages":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "refine", "chunk_size": 2000, "timeout":200}'
}

function stop_docker() {