From f2c111c2cd17c8fe4c9a140e371e193879fef2ab Mon Sep 17 00:00:00 2001 From: Keisuke Kamahori Date: Sun, 28 Apr 2024 23:00:50 +0000 Subject: [PATCH] Fix non-blocking transfer to CPU --- src/fiddler/mixtral.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/fiddler/mixtral.py b/src/fiddler/mixtral.py index df40b6a..4da03c3 100644 --- a/src/fiddler/mixtral.py +++ b/src/fiddler/mixtral.py @@ -543,9 +543,7 @@ def mixtral_forward(self, input_ids, position_ids, is_decode): ) if not is_cuda: - experts[i_expert] = experts[i_expert].to( - "cpu", non_blocking=True - ) + experts[i_expert] = experts[i_expert].to("cpu") # end of one expert @@ -629,10 +627,8 @@ def mixtral_forward(self, input_ids, position_ids, is_decode): current_state = self.run_expert_at_cpu( i_layer, i_expert, - current_state.to("cpu", non_blocking=True), - routing_weights[top_2_list, idx_list, None].to( - "cpu", non_blocking=True - ), + current_state.to("cpu"), + routing_weights[top_2_list, idx_list, None].to("cpu"), ) inps_after_experts.index_add_( 0,