X-LANCE · ddlBoJack · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/README.md b/README.md
@@ -27,8 +27,10 @@ developers to train custom multimodal large language model (MLLM), focusing on <
 5. [Acknowledge](#acknowledge)
 
 # News
-- [Update Apr. 28, 2024] Recipes for automated audio captioning (AAC) with SOTA performance has been supported. 
-- [Update Mar. 31, 2024] Recipes for automatic speech recognition (ASR) with SOTA performance has been supported. 
+- [Update May 8, 2024] Recipes for [visual speech recognition (VSR)](examples/vsr_LRS3/README.md) have been supported. 
+- [Update May 4, 2024] Recipes for [zero-shot text-to-speech (TTS)](examples/vallex/README.md) have been supported. 
+- [Update Apr. 28, 2024] Recipes for [automated audio captioning (AAC)](examples/aac_audiocaps/README.md) have been supported. 
+- [Update Mar. 31, 2024] Recipes for [automatic speech recognition (ASR)](examples/asr_librispeech/README.md) have been supported. 
 
 # Installation
 ```bash
@@ -61,6 +63,7 @@ We provide reference implementations of various LLM-based speech, audio, and mus
 - **Speech Task**
     - [Automatic Speech Recognition (ASR)](examples/asr_librispeech/README.md)
     - [Text-to-Speech (TTS)](examples/vallex/README.md)
+    - [Visual Speech Recognition (VSR)](examples/vsr_LRS3/README.md)
 - **Audio Task**
     - [Automated Audio Captioning (AAC)](examples/aac_audiocaps/README.md)
 

diff --git a/examples/vsr_LRS3/model/slam_model_vsr.py b/examples/vsr_LRS3/model/slam_model_vsr.py
@@ -74,82 +74,4 @@ def __init__(
             train_config,
             model_config,
             **kwargs,
-        )
-
-
-    @torch.no_grad()
-    def inference(
-        self,
-        wav_path=None,
-        prompt=None,
-        generation_config=None,
-        logits_processor=None,
-        stopping_criteria=None,
-        prefix_allowed_tokens_fn=None,
-        synced_gpus=None,
-        assistant_model=None,
-        streamer=None,
-        negative_prompt_ids=None,
-        negative_prompt_attention_mask=None,
-        **kwargs,
-    ):
-        # inference for asr model
-
-        device = kwargs.get("device", "cuda")
-        if os.path.exists(wav_path):  # Audio-Text QA
-            import whisper
-
-            audio_raw = whisper.load_audio(wav_path)
-            audio_raw = whisper.pad_or_trim(audio_raw)
-
-            mel_size = getattr(
-                self.dataset_config, "mel_size", 80
-            )  # 80 for large v1 and v2, 128 for large v3
-            audio_mel = (
-                whisper.log_mel_spectrogram(audio_raw, n_mels=mel_size)
-                .permute(1, 0)[None, :, :]
-                .to(device)
-            )
-
-            encoder_outs = self.encoder.extract_variable_length_features(
-                audio_mel.permute(0, 2, 1)
-            )
-
-            if self.model_config.encoder_projector == "q-former":
-                audio_mel_post_mask = torch.ones(
-                    encoder_outs.size()[:-1], dtype=torch.long
-                ).to(encoder_outs.device)
-                encoder_outs = self.encoder_projector(encoder_outs, audio_mel_post_mask)
-            if self.model_config.encoder_projector == "linear":
-                encoder_outs = self.encoder_projector(encoder_outs)
-        else:  # Text QA
-            encoder_outs = torch.empty(
-                1, 0, self.llm.model.embed_tokens.embedding_dim
-            ).to(device)
-
-        prompt = "USER: {}\n ASSISTANT:".format(prompt)
-        prompt_ids = self.tokenizer.encode(prompt)
-        prompt_length = len(prompt_ids)
-        prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64).to(device)
-
-        if hasattr(self.llm.model, "embed_tokens"):
-            inputs_embeds = self.llm.model.embed_tokens(prompt_ids)
-        elif hasattr(self.llm.model.model, "embed_tokens"):
-            inputs_embeds = self.llm.model.model.embed_tokens(prompt_ids)
-        else:
-            inputs_embeds = self.llm.model.model.model.embed_tokens(prompt_ids)
-
-        inputs_embeds = torch.cat(
-            (encoder_outs, inputs_embeds[None, :, :]), dim=1
-        )  # [audio,prompt]
-
-        attention_mask = torch.ones(inputs_embeds.size()[:-1], dtype=torch.long).to(
-            inputs_embeds.device
-        )
-
-        # generate
-        model_outputs = self.generate(
-            inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs
-        )
-
-        return model_outputs
+        )
diff --git a/src/slam_llm/models/AV/av_net.py b/src/slam_llm/models/AV/av_net.py