From b1baa0a73d32f28650db885e6acace6f6ce45b49 Mon Sep 17 00:00:00 2001
From: Pavel Iakubovskii
Date: Mon, 19 Aug 2024 18:59:07 +0000
Subject: [PATCH 1/2] Replace .norm() with decomposed version for executorch
 export

---
 src/transformers/models/clip/modeling_clip.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index ee85fe3125873b..d5f12c9fe413be 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -67,6 +67,17 @@ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     return (caption_loss + image_loss) / 2.0
 
 
+def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
+    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
+    """
+    square_tensor = torch.pow(tensor, 2)
+    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
+    normed_tensor = torch.pow(sum_tensor, 0.5)
+    return normed_tensor
+
+
 @dataclass
 class CLIPVisionModelOutput(ModelOutput):
     """
@@ -1313,8 +1324,8 @@ def forward(
         text_embeds = self.text_projection(text_embeds)
 
         # normalized features
-        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
-        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+        image_embeds = image_embeds / _get_vector_norm(image_embeds)
+        text_embeds = text_embeds / _get_vector_norm(text_embeds)
 
         # cosine similarity as logits
         logit_scale = self.logit_scale.exp()

From 07b64382bd3f31505a4bd2f389ba98e59cecfc10 Mon Sep 17 00:00:00 2001
From: Pavel Iakubovskii
Date: Mon, 19 Aug 2024 19:10:04 +0000
Subject: [PATCH 2/2] [run_slow] clip