From a08847095d0b7a343a5c987859f808ce002cb467 Mon Sep 17 00:00:00 2001
From: dan nelson
Date: Thu, 11 May 2023 18:46:25 +0000
Subject: [PATCH] adding containerized cog inference

---
 README.md  |  2 +-
 cog.yaml   | 36 ++++++++++++++++++++++++++++
 predict.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 cog.yaml
 create mode 100644 predict.py

diff --git a/README.md b/README.md
index 028fa988..dbc2b2a8 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 Ishan Misra*
 
 To appear at CVPR 2023 (*Highlighted paper*)
 
-[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)]
+[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)] [[`Replicate Demo`](https://replicate.com/daanelson/imagebind)]
 
 PyTorch implementation and pretrained models for ImageBind. For details, see the paper: **[ImageBind: One Embedding Space To Bind Them All](https://facebookresearch.github.io/ImageBind/paper)**.
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..e4452b2f
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,36 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  cuda: "11.6"
+
+  # a list of ubuntu apt packages to install
+  # system_packages:
+  #   - "libgl1-mesa-glx"
+  #   - "libglib2.0-0"
+
+  # python version in the form '3.8' or '3.8.12'
+  python_version: "3.9"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - "torch==1.13"
+    - "torchvision==0.14.0"
+    - "torchaudio==0.13.0"
+    - "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d"
+    - "timm==0.6.7"
+    - "ftfy"
+    - "regex"
+    - "einops"
+    - "fvcore"
+    - "decord==0.6.0"
+
+  # commands run after the environment is setup
+  # run:
+  #   - "echo env is ready!"
+  #   - "echo another command if needed"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..47c8dcdd
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,69 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+from typing import List
+from cog import BasePredictor, Input, Path
+import data
+import torch
+from models import imagebind_model
+from models.imagebind_model import ModalityType
+
+MODALITY_TO_PREPROCESSING = {
+    ModalityType.TEXT: data.load_and_transform_text,
+    ModalityType.VISION: data.load_and_transform_vision_data,
+    ModalityType.AUDIO: data.load_and_transform_audio_data,
+}
+
+
+class Predictor(BasePredictor):
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+        model = imagebind_model.imagebind_huge(pretrained=True)
+        model.eval()
+        self.model = model.to("cuda")
+
+    def predict(
+        self,
+        input: Path = Input(
+            description="file that you want to embed. Needs to be text, vision, or audio.",
+            default=None,
+        ),
+        text_input: str = Input(
+            description="text that you want to embed. Provide a string here instead of passing a text file to the input argument, if you'd like.",
+            default=None,
+        ),
+        modality: str = Input(
+            description="modality of the input you'd like to embed",
+            choices=list(MODALITY_TO_PREPROCESSING.keys()),
+            default=ModalityType.VISION,
+        ),
+    ) -> List[float]:
+        """Infer a single embedding with the model"""
+
+        if not input and not text_input:
+            raise Exception(
+                "Neither input nor text_input were provided! Provide one in order to generate an embedding"
+            )
+
+        modality_function = MODALITY_TO_PREPROCESSING[modality]
+
+        if modality == "text":
+            if input and text_input:
+                raise Exception(
+                    f"Input and text_input were both provided! Only provide one to generate an embedding.\nInput provided: {input}\nText Input provided: {text_input}"
+                )
+            if text_input:
+                input = text_input
+            else:
+                with open(input, "r") as f:
+                    text_input = f.read()
+                input = text_input
+
+        device = "cuda"
+        model_input = {modality: modality_function([input], device)}
+
+        with torch.no_grad():
+            embeddings = self.model(model_input)
+        # embeddings is a dict keyed by modality; return the one we computed
+        emb = embeddings[modality]
+        return emb.cpu().squeeze().tolist()
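
Note (not part of the commit above): a minimal local smoke test of the new Predictor could look like the sketch below. It assumes the ImageBind repo root is the working directory (so the data and models modules import and the pretrained imagebind_huge weights can download on first use), a CUDA-capable GPU, and that an example image such as .assets/dog_image.jpg exists; the file path and script name are illustrative only.

# smoke_test_predictor.py -- hypothetical helper script, not included in this patch
import torch

from predict import Predictor

predictor = Predictor()
predictor.setup()  # loads imagebind_huge onto the GPU (downloads weights on first run)

# Embed a text string directly, no file needed
text_emb = predictor.predict(input=None, text_input="a dog playing fetch", modality="text")

# Embed an image file; substitute any local image path
image_emb = predictor.predict(input=".assets/dog_image.jpg", text_input=None, modality="vision")

# Both results are plain Python lists of floats in the shared embedding space,
# so they can be compared directly, e.g. with cosine similarity.
similarity = torch.nn.functional.cosine_similarity(
    torch.tensor(text_emb), torch.tensor(image_emb), dim=0
)
print(len(text_emb), len(image_emb), float(similarity))

With Cog installed, the same Predictor can also be exercised inside the container via cog predict; the direct Python call above is just the quickest way to sanity-check the interface.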