From f8ba216b153d3ca9f9dea7e1f782622dc5b8e285 Mon Sep 17 00:00:00 2001
From: Ella Charlaix
Date: Mon, 23 Oct 2023 19:41:48 +0200
Subject: [PATCH] fix

---
 optimum/intel/openvino/modeling_decoder.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 8af37de8a4..0e018f9f62 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -404,12 +404,13 @@ def forward(
         )
 
         # Run inference
-        results = self.request.infer(inputs, share_inputs=True, share_outputs=True)
-        logits = torch.from_numpy(results["logits"]).to(self.device)
+        self.request.start_async(inputs, shared_memory=True)
+        self.request.wait()
+        logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device)
 
         if self.use_cache:
             # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer)
-            past_key_values = tuple(results[key] for key in self.key_value_output_names)
+            past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names)
             if self.config.model_type not in MULTI_QUERY_ATTN_MODELS:
                 # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention)
                 past_key_values = tuple(
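
For context, the patch replaces the synchronous `InferRequest.infer` call with the asynchronous `start_async`/`wait` pair and reads outputs back via `get_tensor`. Below is a minimal standalone sketch of that pattern, assuming an OpenVINO release whose `start_async` still accepts the `shared_memory` keyword (as it does in the diff above); the model path, input name, and input shape are hypothetical placeholders, not part of the patch.

```python
# Minimal sketch of the async inference pattern introduced by this patch,
# shown outside of optimum-intel. Assumes an OpenVINO release where
# InferRequest.start_async accepts the `shared_memory` keyword, as used in
# the diff above. "model.xml" and "input_ids" are hypothetical placeholders.
import numpy as np
import openvino.runtime as ov

core = ov.Core()
compiled = core.compile_model("model.xml", "CPU")  # hypothetical model path
request = compiled.create_infer_request()

inputs = {"input_ids": np.ones((1, 8), dtype=np.int64)}  # hypothetical input

# Start inference asynchronously, then block until the results are ready.
request.start_async(inputs, shared_memory=True)
request.wait()

# Read outputs by tensor name, mirroring what the patch does for "logits".
logits = request.get_tensor("logits").data
```

One note on the design choice: with `shared_memory=True` the input buffers are shared with the request rather than copied, so they should stay alive and unmodified until `wait()` returns.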