From fa26f43005a2ed5912c0dc3ec49e35722f2d9913 Mon Sep 17 00:00:00 2001 From: dan nelson Date: Thu, 11 May 2023 18:46:25 +0000 Subject: [PATCH] adding containerized cog inference --- README.md | 2 +- cog.yaml | 36 ++++++++++++++++++++++++++++ predict.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 cog.yaml create mode 100644 predict.py diff --git a/README.md b/README.md index 028fa988..dbc2b2a8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Ishan Misra* To appear at CVPR 2023 (*Highlighted paper*) -[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)] +[[`Paper`](https://facebookresearch.github.io/ImageBind/paper)] [[`Blog`](https://ai.facebook.com/blog/imagebind-six-modalities-binding-ai/)] [[`Demo`](https://imagebind.metademolab.com/)] [[`Supplementary Video`](https://dl.fbaipublicfiles.com/imagebind/imagebind_video.mp4)] [[`BibTex`](#citing-imagebind)] [[`Replicate Demo`](https://replicate.com/daanelson/imagebind)] PyTorch implementation and pretrained models for ImageBind. For details, see the paper: **[ImageBind: One Embedding Space To Bind Them All](https://facebookresearch.github.io/ImageBind/paper)**. 
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..e4452b2f
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,36 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  cuda: "11.6"
+
+  # a list of ubuntu apt packages to install
+  # system_packages:
+  #   - "libgl1-mesa-glx"
+  #   - "libglib2.0-0"
+
+  # python version in the form '3.8' or '3.8.12'
+  python_version: "3.9"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - "torch==1.13"
+    - "torchvision==0.14.0"
+    - "torchaudio==0.13.0"
+    - "pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d"
+    - "timm==0.6.7"
+    - "ftfy"
+    - "regex"
+    - "einops"
+    - "fvcore"
+    - "decord==0.6.0"
+
+  # commands run after the environment is setup
+  # run:
+  #   - "echo env is ready!"
+  #   - "echo another command if needed"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..47c8dcdd
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,91 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+from typing import List, Optional
+from cog import BasePredictor, Input, Path
+import data
+import torch
+from models import imagebind_model
+from models.imagebind_model import ModalityType
+
+# Preprocessing function for each modality this predictor accepts.
+MODALITY_TO_PREPROCESSING = {
+    ModalityType.TEXT: data.load_and_transform_text,
+    ModalityType.VISION: data.load_and_transform_vision_data,
+    ModalityType.AUDIO: data.load_and_transform_audio_data,
+}
+
+
+class Predictor(BasePredictor):
+    """Cog predictor that returns an ImageBind embedding for one input."""
+
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+        # Use the GPU when one is visible; otherwise fall back to CPU so the
+        # predictor can still start (e.g. for local testing).
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = imagebind_model.imagebind_huge(pretrained=True)
+        model.eval()
+        self.model = model.to(self.device)
+
+    def predict(
+        self,
+        input: Path = Input(
+            description="file that you want to embed. Needs to be text, vision, or audio.",
+            default=None,
+        ),
+        text_input: str = Input(
+            description="text that you want to embed. Provide a string here instead of a text file to input if you'd like.",
+            default=None,
+        ),
+        modality: str = Input(
+            description="modality of the input you'd like to embed",
+            choices=list(MODALITY_TO_PREPROCESSING.keys()),
+            default=ModalityType.VISION,
+        ),
+    ) -> List[float]:
+        """Infer a single embedding with the model.
+
+        Args:
+            input: File to embed. For the text modality it is read as text.
+            text_input: Literal text to embed; only used for the text modality.
+            modality: Key of MODALITY_TO_PREPROCESSING selecting the preprocessing.
+
+        Returns:
+            The embedding as a list of floats.
+
+        Raises:
+            ValueError: If no usable input was given for the chosen modality, or
+                if both input and text_input were given for the text modality.
+        """
+        if not input and not text_input:
+            raise ValueError(
+                "Neither input nor text_input were provided! Provide one in order to generate an embedding"
+            )
+
+        if modality == ModalityType.TEXT:
+            if input and text_input:
+                raise ValueError(
+                    f"Input and text_input were both provided! Only provide one to generate an embedding.\nInput provided: {input}\nText Input provided: {text_input}"
+                )
+            if text_input:
+                item = text_input
+            else:
+                # Read the file as one string so it is embedded exactly like
+                # text_input; readlines() would nest a list inside the batch.
+                with open(input, "r", encoding="utf-8") as f:
+                    item = f.read()
+        elif input:
+            item = input
+        else:
+            # Only text_input was supplied, which the file-based modalities ignore.
+            raise ValueError(
+                f"A file must be provided through input for the {modality} modality; text_input is only used for text."
+            )
+
+        preprocess = MODALITY_TO_PREPROCESSING[modality]
+        model_input = {modality: preprocess([item], self.device)}
+
+        with torch.no_grad():
+            embeddings = self.model(model_input)
+        return embeddings[modality].cpu().squeeze().tolist()