From 73ff9c5bd076bf5fd1025338126132edb67bff8d Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Tue, 13 Aug 2024 00:17:57 +0530 Subject: [PATCH] Add seamless text-to-speech and text-to-text translation with updated dependencies - Create `seamless_v2.py` for new seamless text-to-speech and text-to-text translation tasks. - Update Dockerfile to use CUDA 12.4.1 and Ubuntu 22.04. - Upgrade dependencies in `requirements.txt`. - Modify `model-values.yaml` to update image reference, GPU, and memory limits. - Simplify `seamless_asr.py` for seamless ASR only (as huggingface expects a different model class for asr vs other tasks) - Fix `run-dev.sh` to enable NVIDIA runtime and GPU usage. --- chart/model-values.yaml | 5 +- common/Dockerfile | 2 +- common/deepfloyd.py | 137 --------------- common/pipeline_if_sr_patch.py | 308 --------------------------------- common/requirements.txt | 18 +- common/seamless_asr.py | 122 +++---------- common/seamless_v2.py | 83 +++++++++ scripts/run-dev.sh | 2 +- 8 files changed, 121 insertions(+), 556 deletions(-) delete mode 100644 common/deepfloyd.py delete mode 100644 common/pipeline_if_sr_patch.py create mode 100644 common/seamless_v2.py diff --git a/chart/model-values.yaml b/chart/model-values.yaml index 34be57c..38c6501 100644 --- a/chart/model-values.yaml +++ b/chart/model-values.yaml @@ -195,9 +195,10 @@ deployments: bark - name: "common-seamless" - image: "us-docker.pkg.dev/dara-c1b52/cloudbuild/gooey-gpu/common@sha256:5fb0ffa128cbdda86747fedf5ef68e9df8256735d8535149c6fffa41a3749883" + image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-common:3" limits: - memory: "19Gi" + gpu: "10Gi" + memory: "28Gi" # (220 / 80) * 10 cpu: "1" env: IMPORTS: |- diff --git a/common/Dockerfile b/common/Dockerfile index 0f31852..6db0dec 100644 --- a/common/Dockerfile +++ b/common/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.2.0-devel-ubuntu20.04 +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 ARG DEBIAN_FRONTEND=noninteractive diff --git a/common/deepfloyd.py b/common/deepfloyd.py deleted file mode 100644 index ed2b2b4..0000000 --- a/common/deepfloyd.py +++ /dev/null @@ -1,137 +0,0 @@ -from functools import lru_cache - -import torch -from diffusers import ( - IFPipeline, - StableDiffusionUpscalePipeline, -) -from fastapi import APIRouter -from pydantic import BaseModel -from transformers import T5EncoderModel - -import gooey_gpu -from common.diffusion import safety_checker_wrapper -from common.pipeline_if_sr_patch import IFSuperResolutionPipelinePatch - -app = APIRouter(prefix="/deepfloyd_if") - - -class PipelineInfo(BaseModel): - upload_urls: list[str] = [] - model_id: tuple[str, str, str] - seed: int = 42 - disable_safety_checker: bool = False - - -class DeepfloydInputs(BaseModel): - prompt: list[str] - negative_prompt: list[str] = None - num_inference_steps: tuple[int, int, int] = (100, 50, 75) - num_images_per_prompt: int = 1 - guidance_scale: tuple[float, float, float] = (7, 4, 9) - - -class Text2ImgInputs(DeepfloydInputs): - width: int - height: int - - -@app.post("/text2img/") -@gooey_gpu.endpoint -def text2img(pipeline: PipelineInfo, inputs: Text2ImgInputs): - output_images = _run_model(pipeline, inputs) - gooey_gpu.upload_images(output_images, pipeline.upload_urls) - - -@gooey_gpu.gpu_task -def _run_model(pipeline: PipelineInfo, inputs: Text2ImgInputs): - pipe1 = load_pipe1(pipeline.model_id[0]) - pipe2 = load_pipe2(pipeline.model_id[1]) - pipe3 = load_pipe3(pipeline.model_id[2]) - - inputs.prompt *= inputs.num_images_per_prompt - if inputs.negative_prompt: - 
inputs.negative_prompt *= inputs.num_images_per_prompt - - with gooey_gpu.use_models(pipe1), torch.inference_mode(): - generator = torch.Generator().manual_seed(pipeline.seed) - # custom safety checker impl - safety_checker_wrapper(pipe1, disabled=pipeline.disable_safety_checker) - # Create text embeddings - prompt_embeds, negative_embeds = pipe1.encode_prompt( - inputs.prompt, negative_prompt=inputs.negative_prompt - ) - # The main diffusion process - images = pipe1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - guidance_scale=inputs.guidance_scale[0], - num_inference_steps=inputs.num_inference_steps[0], - output_type="pt", - generator=generator, - width=inputs.width // 16, - height=inputs.height // 16, - ).images - - with gooey_gpu.use_models(pipe2), torch.inference_mode(): - # custom safety checker impl - safety_checker_wrapper(pipe2, disabled=pipeline.disable_safety_checker) - # Super Resolution 64x64 to 256x256 - images = pipe2( - image=images, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - guidance_scale=inputs.guidance_scale[1], - num_inference_steps=inputs.num_inference_steps[1], - output_type="pt", - generator=generator, - ).images - - with gooey_gpu.use_models(pipe3), torch.inference_mode(): - # custom safety checker impl - safety_checker_wrapper(pipe3, disabled=pipeline.disable_safety_checker) - # Super Resolution 256x256 to 1024x1024 - output_images = pipe3( - image=images, - prompt=inputs.prompt, - negative_prompt=inputs.negative_prompt, - guidance_scale=inputs.guidance_scale[2], - num_inference_steps=inputs.num_inference_steps[2], - generator=generator, - ).images - - return output_images - - -@lru_cache -def load_pipe1(model_id: str): - # text_encoder = T5EncoderModel.from_pretrained( - # model_id, - # subfolder="text_encoder", - # load_in_8bit=True, - # variant="8bit", - # ) - return IFPipeline.from_pretrained( - model_id, - # text_encoder=text_encoder, - variant="fp16", - torch_dtype=torch.float16, - ) - - -@lru_cache -def load_pipe2(model_id: str): - return IFSuperResolutionPipelinePatch.from_pretrained( - model_id, - text_encoder=None, # no use of text encoder => memory savings! 
- variant="fp16", - torch_dtype=torch.float16, - ) - - -@lru_cache -def load_pipe3(model_id: str): - return StableDiffusionUpscalePipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - ) diff --git a/common/pipeline_if_sr_patch.py b/common/pipeline_if_sr_patch.py deleted file mode 100644 index 4cbc7b2..0000000 --- a/common/pipeline_if_sr_patch.py +++ /dev/null @@ -1,308 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, Union - -import PIL -import numpy as np -import torch -import torch.nn.functional as F -from diffusers import IFSuperResolutionPipeline -from diffusers.pipelines.deepfloyd_if import IFPipelineOutput -from diffusers.utils import ( - randn_tensor, -) - - -class IFSuperResolutionPipelinePatch(IFSuperResolutionPipeline): - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 4.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 250, - clean_caption: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): - The image to be upscaled. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. 
- generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to 250): - The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. Raise error if not correct - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, - image, - batch_size, - noise_level, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - - # height = self.unet.config.sample_size - # width = self.unet.config.sample_size - height, width = image.shape[2:] - width *= 4 - height *= 4 - - device = self._execution_device - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. 
- do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - device=device, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps, device=device) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare intermediate images - num_channels = self.unet.config.in_channels // 2 - intermediate_images = self.prepare_intermediate_images( - batch_size * num_images_per_prompt, - num_channels, - height, - width, - prompt_embeds.dtype, - device, - generator, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare upscaled image and noise level - image = self.preprocess_image(image, num_images_per_prompt, device) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True - ) - - noise_level = torch.tensor( - [noise_level] * upscaled.shape[0], device=upscaled.device - ) - noise = randn_tensor( - upscaled.shape, - generator=generator, - device=upscaled.device, - dtype=upscaled.dtype, - ) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level - ) - - if do_classifier_free_guidance: - noise_level = torch.cat([noise_level] * 2) - - # HACK: see comment in `enable_model_cpu_offload` - if ( - hasattr(self, "text_encoder_offload_hook") - and self.text_encoder_offload_hook is not None - ): - self.text_encoder_offload_hook.offload() - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = torch.cat([intermediate_images, upscaled], dim=1) - - model_input = ( - torch.cat([model_input] * 2) - if do_classifier_free_guidance - else model_input - ) - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - model_input.shape[1] // 2, dim=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - model_input.shape[1] // 2, dim=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) - - # compute the previous noisy sample x_t -> x_t-1 - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 - ): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 9. Post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - - # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, device, prompt_embeds.dtype - ) - - # 11. Convert to PIL - image = self.numpy_to_pil(image) - - # 12. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pt": - nsfw_detected = None - watermark_detected = None - - if ( - hasattr(self, "unet_offload_hook") - and self.unet_offload_hook is not None - ): - self.unet_offload_hook.offload() - else: - # 9. Post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - - # 10. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, device, prompt_embeds.dtype - ) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput( - images=image, - nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, - ) diff --git a/common/requirements.txt b/common/requirements.txt index 63027cf..a932a1b 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -7,19 +7,19 @@ accelerate ~= 0.20.3 celery ~= 5.3.0 pydantic ~= 1.10.9 redis ~= 4.5.5 -transformers ~= 4.40.1 +transformers ~= 4.44.0 ## pytorch ---extra-index-url https://download.pytorch.org/whl/cu118 -torch ~= 2.0.0 -torchvision ~= 0.15.1 -torchaudio ~= 2.0.1 +--extra-index-url https://download.pytorch.org/whl/cu124 +torch ~= 2.4.0 +torchvision ~= 0.19.0 +torchaudio ~= 2.4.0 ## huggingface diffusers -diffusers ~= 0.21.1 -sentencepiece ~= 0.1.99 -torchsde ~= 0.2.5 -xformers ~= 0.0.20 +diffusers ~= 0.30.0 +sentencepiece ~= 0.2.0 +torchsde ~= 0.2.6 +xformers ~= 0.0.27 ## controlnet controlnet_aux ~= 0.0.1 diff --git a/common/seamless_asr.py b/common/seamless_asr.py index 78b7300..f7934ef 100644 --- a/common/seamless_asr.py +++ b/common/seamless_asr.py @@ -7,109 +7,40 @@ import transformers from pydantic import BaseModel -import io -from scipy.io.wavfile import write - import gooey_gpu from api import AsrOutput from celeryconfig import app, setup_queues -class SeamlessM4TPipeline(BaseModel): - upload_urls: typing.List[str] = [] - model_id: typing.Literal["facebook/seamless-m4t-v2-large"] = ( - "facebook/seamless-m4t-v2-large" - ) +class SeamlessASRPipeline(BaseModel): + model_id: str -class SeamlessM4TInputs(BaseModel): - audio: str | None = None # required for ASR, S2ST, and S2TT - text: str | None = None # required for T2ST and T2TT - src_lang: str | None = None # required for T2ST and T2TT - tgt_lang: str | None = None # ignored for ASR (only src_lang is used) - # seamless uses ISO 639-3 codes for languages +class SeamlessASRInputs(BaseModel): + audio: str + src_lang: str | None = None + tgt_lang: str | None = None chunk_length_s: float = 30 stride_length_s: typing.Tuple[float, float] = (6, 0) batch_size: int = 16 - speaker_id: int = 0 # only used for T2ST, value in [0, 200) - -@app.task(name="seamless.t2st") -@gooey_gpu.endpoint -def seamless_text2speech_translation( - pipeline: SeamlessM4TPipeline, - inputs: SeamlessM4TInputs, -) -> None: - _, processor, model = load_pipe(pipeline.model_id) - tgt_lang = inputs.tgt_lang or inputs.src_lang or "eng" - - assert inputs.text is not None - assert inputs.src_lang is not None - text_inputs = processor( - text=inputs.text, src_lang=inputs.src_lang, return_tensors="pt" - ) - - audio_array_from_text = ( - model.generate(**text_inputs, tgt_lang=tgt_lang, speaker_id=inputs.speaker_id)[ - 0 - ] - .cpu() - .numpy() - .squeeze() - ) - - bytes_wav = bytes() - byte_io = io.BytesIO(bytes_wav) - write(byte_io, 16000, audio_array_from_text) - audio_bytes = byte_io.read() - gooey_gpu.upload_audio_from_bytes(audio_bytes, pipeline.upload_urls[0]) - return - - -@app.task(name="seamless.t2tt") -@gooey_gpu.endpoint -def seamless_text2text_translation( - pipeline: SeamlessM4TPipeline, - inputs: SeamlessM4TInputs, -) -> AsrOutput | None: - _, processor, model = load_pipe(pipeline.model_id) - tgt_lang = inputs.tgt_lang or inputs.src_lang or 
"eng" - - assert inputs.text is not None - assert inputs.src_lang is not None - text_inputs = processor( - text=inputs.text, src_lang=inputs.src_lang, return_tensors="pt" - ) - - output_tokens = model.generate( - **text_inputs, tgt_lang=tgt_lang, generate_speech=False - ) - translated_text_from_text = processor.decode( - output_tokens[0].tolist()[0], skip_special_tokens=True - ) - - return AsrOutput(text=translated_text_from_text) - - -@app.task(name="seamless") +@app.task(name="seamless.asr") @gooey_gpu.endpoint def seamless_asr( - pipeline: SeamlessM4TPipeline, - inputs: SeamlessM4TInputs, -) -> AsrOutput | None: - pipe, _, _ = load_pipe(pipeline.model_id) - tgt_lang = inputs.tgt_lang or inputs.src_lang or "eng" - - assert inputs.audio is not None - + pipeline: SeamlessASRPipeline, + inputs: SeamlessASRInputs, +) -> AsrOutput: audio = requests.get(inputs.audio).content + pipe = load_pipe(pipeline.model_id) previous_src_lang = pipe.tokenizer.src_lang if inputs.src_lang: pipe.tokenizer.src_lang = inputs.src_lang + tgt_lang = inputs.tgt_lang or inputs.src_lang + prediction = pipe( audio, # see https://colab.research.google.com/drive/1rS1L4YSJqKUH_3YxIQHBI982zso23wor#scrollTo=Ca4YYdtATxzo&line=5&uniqifier=1 @@ -125,23 +56,18 @@ def seamless_asr( @lru_cache -def load_pipe( - model_id: str, -) -> typing.Tuple[ - transformers.AutomaticSpeechRecognitionPipeline, - transformers.SeamlessM4TProcessor, - transformers.SeamlessM4Tv2Model, -]: - print(f"Loading asr model {model_id!r}...") - pipe = transformers.pipeline( - task="automatic-speech-recognition", - model=model_id, - device=gooey_gpu.DEVICE_ID, - torch_dtype=torch.float16, +def load_pipe(model_id: str) -> transformers.AutomaticSpeechRecognitionPipeline: + print(f"Loading seamless m4t pipeline {model_id!r}...") + pipe = typing.cast( + transformers.AutomaticSpeechRecognitionPipeline, + transformers.pipeline( + task="automatic-speech-recognition", + model=model_id, + device=gooey_gpu.DEVICE_ID, + torch_dtype=torch.float16, + ), ) - processor = transformers.AutoProcessor.from_pretrained(model_id) - model = transformers.SeamlessM4Tv2Model.from_pretrained(model_id) - return pipe, processor, model + return pipe setup_queues( diff --git a/common/seamless_v2.py b/common/seamless_v2.py new file mode 100644 index 0000000..e7a8c7e --- /dev/null +++ b/common/seamless_v2.py @@ -0,0 +1,83 @@ +import typing +from functools import lru_cache + +import transformers +from pydantic import BaseModel + +import gooey_gpu +from celeryconfig import app + + +class SeamlessPipeline(BaseModel): + upload_urls: typing.List[str] = [] + model_id: str + + +class SeamlessT2STInputs(BaseModel): + text: str + src_lang: str + tgt_lang: str + speaker_id: int = 0 # [0, 200) + + +@app.task(name="seamless.t2st") +@gooey_gpu.endpoint +def seamless_text_to_speech_translation( + pipeline: SeamlessPipeline, + inputs: SeamlessT2STInputs, +) -> None: + model, processor = load_model(pipeline.model_id) + text_inputs = processor( + text=inputs.text, src_lang=inputs.src_lang, return_tensors="pt" + ).to(gooey_gpu.DEVICE_ID) + audio_array_from_text = ( + model.generate( + **text_inputs, tgt_lang=inputs.tgt_lang, speaker_id=inputs.speaker_id + )[0] + .cpu() + .numpy() + .squeeze() + ) + gooey_gpu.upload_audio(audio_array_from_text, pipeline.upload_urls[0]) + + +class SeamlessT2TTInputs(BaseModel): + text: str + src_lang: str + tgt_lang: str + + +@app.task(name="seamless.t2tt") +@gooey_gpu.endpoint +def seamless_text2text_translation( + pipeline: SeamlessPipeline, + inputs: SeamlessT2TTInputs, +) 
-> str: + model, processor = load_model(pipeline.model_id) + text_inputs = processor( + text=inputs.text, src_lang=inputs.src_lang, return_tensors="pt" + ).to(gooey_gpu.DEVICE_ID) + output_tokens = model.generate( + **text_inputs, tgt_lang=inputs.tgt_lang, generate_speech=False + ) + translated_text_from_text = processor.decode( + output_tokens[0].tolist()[0], skip_special_tokens=True + ) + return translated_text_from_text + + +@lru_cache +def load_model(model_id: str) -> typing.Tuple[ + transformers.SeamlessM4Tv2Model, + transformers.SeamlessM4TProcessor, +]: + print(f"Loading seamless m4t model {model_id!r}...") + model = typing.cast( + transformers.SeamlessM4Tv2Model, + transformers.AutoModel.from_pretrained(model_id).to(gooey_gpu.DEVICE_ID), + ) + processor = typing.cast( + transformers.SeamlessM4TProcessor, + transformers.AutoProcessor.from_pretrained(model_id), + ) + return model, processor diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh index 873a91c..79c53bb 100755 --- a/scripts/run-dev.sh +++ b/scripts/run-dev.sh @@ -75,7 +75,7 @@ docker run \ -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ -v $HOME/.cache/huggingface:/root/.cache/huggingface \ -v $HOME/.cache/torch:/root/.cache/torch \ - --net host \ + --net host --runtime=nvidia --gpus all \ --memory 14g \ -it --rm --name $IMG \ $IMG:latest
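
Example client-side usage of the new tasks (a rough sketch, not part of the diff above). It assumes a Redis broker and result backend at placeholder URLs, a placeholder queue name, and that the `pipeline` / `inputs` arguments can be sent as plain dicts matching the pydantic models in `common/seamless_v2.py`; none of that is verified here. The model id and the ISO 639-3 language codes are taken from the existing seamless code.

    import celery

    # Broker/backend URLs are placeholders; the real values live in celeryconfig.py.
    app = celery.Celery(broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

    result = app.send_task(
        "seamless.t2tt",  # text-to-text translation task defined in common/seamless_v2.py
        kwargs=dict(
            pipeline=dict(model_id="facebook/seamless-m4t-v2-large"),
            inputs=dict(text="A quick test sentence.", src_lang="eng", tgt_lang="hin"),
        ),
        queue="common-seamless",  # placeholder queue name; actual routing comes from setup_queues()
    )
    print(result.get(timeout=600))  # seamless.t2tt returns the translated string

The `seamless.t2st` task follows the same pattern, but it returns nothing: it expects `pipeline.upload_urls[0]` to be a pre-signed URL that the generated audio is uploaded to, and it additionally accepts a `speaker_id` in `[0, 200)`.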