Commit f8ba216

fix
echarlaix committed Oct 23, 2023
1 parent 36d482e commit f8ba216
Showing 1 changed file with 4 additions and 3 deletions.
optimum/intel/openvino/modeling_decoder.py (7 changes: 4 additions & 3 deletions)
@@ -404,12 +404,13 @@ def forward(
         )

         # Run inference
-        results = self.request.infer(inputs, share_inputs=True, share_outputs=True)
-        logits = torch.from_numpy(results["logits"]).to(self.device)
+        self.request.start_async(inputs, shared_memory=True)
+        self.request.wait()
+        logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device)

         if self.use_cache:
             # Tuple of length equal to: number of layers * number of past_key_values per decoder layer (2 corresponds to the self-attention layer)
-            past_key_values = tuple(results[key] for key in self.key_value_output_names)
+            past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names)
             if self.config.model_type not in MULTI_QUERY_ATTN_MODELS:
                 # Tuple of tuples of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention)
                 past_key_values = tuple(
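For context: the change above replaces a single blocking `infer()` call with the asynchronous submit-and-wait pattern of the OpenVINO Python API, reading each output back by name via `get_tensor(...).data` rather than indexing a results object. Below is a minimal, self-contained sketch of that pattern as of OpenVINO ~2023.x; the model path, device, output name "logits", and dummy input are hypothetical placeholders, not taken from this commit.

import numpy as np
import openvino.runtime as ov

core = ov.Core()
compiled_model = core.compile_model("model.xml", "CPU")  # hypothetical IR path and device
request = compiled_model.create_infer_request()

inputs = {"input_ids": np.ones((1, 8), dtype=np.int64)}  # hypothetical dummy input

# Old pattern (removed by this commit): one blocking call returning the outputs.
# results = request.infer(inputs, share_inputs=True, share_outputs=True)

# New pattern (added by this commit): submit the request asynchronously,
# block until it completes, then read each output tensor by name.
request.start_async(inputs, shared_memory=True)
request.wait()
logits = request.get_tensor("logits").data  # numpy view over the output tensor

This assumes the compiled model actually exposes an output named "logits". Note that in later OpenVINO releases the `shared_memory` keyword was deprecated in favor of `share_inputs`, so the exact argument name depends on the installed version.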

0 comments on commit f8ba216