From bd5d2e459c251c59041b838f2b235277dd031651 Mon Sep 17 00:00:00 2001 From: jiwaszki Date: Mon, 11 Sep 2023 13:50:41 +0000 Subject: [PATCH 1/2] Update sharing memory flags for OpenVINO --- optimum/intel/openvino/modeling_decoder.py | 7 +++---- optimum/intel/openvino/modeling_diffusion.py | 8 ++++---- optimum/intel/openvino/modeling_seq2seq.py | 9 ++++----- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b5d4f0be5d..28c839f231 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -366,14 +366,13 @@ def forward( inputs["attention_mask"] = np.array(attention_mask) # Run inference - self.request.start_async(inputs, shared_memory=True) - self.request.wait() + results = self.request.infer(inputs, share_inputs=True, share_outputs=True) - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + logits = torch.from_numpy(results["logits"]).to(self.device) if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(results[key] for key in self.key_value_output_names) # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1085c9e81c..b1679595d3 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -556,7 +556,7 @@ def __call__(self, input_ids: np.ndarray): inputs = { "input_ids": input_ids, } - outputs = self.request(inputs, shared_memory=True) + outputs = self.request(inputs, share_inputs=True, share_outputs=True) return list(outputs.values()) @@ -587,7 +587,7 @@ def __call__( if time_ids is not None: inputs["time_ids"] = time_ids - outputs = self.request(inputs, shared_memory=True) + outputs = self.request(inputs, share_inputs=True, share_outputs=True) return list(outputs.values()) @@ -603,7 +603,7 @@ def __call__(self, latent_sample: np.ndarray): inputs = { "latent_sample": latent_sample, } - outputs = self.request(inputs, shared_memory=True) + outputs = self.request(inputs, share_inputs=True, share_outputs=True) return list(outputs.values()) @@ -619,7 +619,7 @@ def __call__(self, sample: np.ndarray): inputs = { "sample": sample, } - outputs = self.request(inputs, shared_memory=True) + outputs = self.request(inputs, share_inputs=True, share_outputs=True) return list(outputs.values()) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 0f52335639..c56ef632e6 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -336,7 +336,7 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - last_hidden_state = torch.from_numpy(self.request(inputs, shared_memory=True)["last_hidden_state"]).to( + last_hidden_state = torch.from_numpy(self.request(inputs, share_inputs=True)["last_hidden_state"]).to( self.device ) @@ -414,13 +414,12 @@ def forward( inputs["encoder_hidden_states"] = encoder_hidden_states # Run inference - self.request.start_async(inputs, shared_memory=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + results = self.request.infer(inputs, share_inputs=True, share_outputs=True) + logits = torch.from_numpy(results["logits"]).to(self.device) # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + out_past_key_values = tuple(results[key] for key in self.key_value_output_names) # Tuple of tuple of length `n_layers`, with each tuple of length equal to: # * 4 for the decoder without cache (k/v of self-attention + k/v of cross-attention) From 1eb70cc9b10b6df5c83f718c9d009f28d63ef5ca Mon Sep 17 00:00:00 2001 From: jiwaszki Date: Tue, 19 Sep 2023 14:01:11 +0000 Subject: [PATCH 2/2] Update OpenVINO runtime and dev versions --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 769431c31c..0034ec2560 100644 --- a/setup.py +++ b/setup.py @@ -42,8 +42,8 @@ "onnx", "onnxruntime<1.15.0", ], - "openvino": ["openvino>=2023.0.0", "onnx", "onnxruntime"], - "nncf": ["nncf>=2.5.0", "openvino-dev>=2023.0.0"], + "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], + "nncf": ["nncf>=2.5.0", "openvino-dev>=2023.1.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE,