diff --git a/api/logs/ltx_api.log b/api/logs/ltx_api.log index 42b9871..08fe168 100644 --- a/api/logs/ltx_api.log +++ b/api/logs/ltx_api.log @@ -12,3 +12,14 @@ 2024-11-26 11:27:36.160 | INFO | __main__:main:347 - Starting LTX video generation server on port 8000 2024-11-26 11:31:41.717 | INFO | __main__:main:343 - Starting LTX video generation server on port 8000 2024-11-26 11:34:03.736 | INFO | __main__:main:343 - Starting LTX video generation server on port 8000 +2024-11-28 18:16:26.552 | INFO | __main__:main:343 - Starting LTX video generation server on port 8000 +2024-11-28 18:33:29.617 | ERROR | __main__:main:347 - Server failed to start: One or more workers failed to start. Shutting down LitServe +2024-11-28 18:35:14.855 | INFO | __main__:main:343 - Starting LTX video generation server on port 8000 +2024-11-28 18:42:27.070 | INFO | __main__:main:343 - Starting LTX video generation server on port 8000 +2024-11-28 18:56:53.323 | INFO | __main__:main:344 - Starting LTX video generation server on port 8000 +2024-11-28 19:03:09.065 | INFO | __main__:main:348 - Starting LTX video generation server on port 8000 +2024-11-28 19:03:54.590 | INFO | __main__:main:348 - Starting LTX video generation server on port 8000 +2024-11-28 19:05:52.633 | INFO | __main__:main:348 - Starting LTX video generation server on port 8000 +2024-11-28 19:09:10.161 | INFO | __main__:main:348 - Starting LTX video generation server on port 8000 +2024-11-28 19:11:22.845 | INFO | __main__:main:356 - Starting LTX video generation server on port 8000 +2024-11-28 19:13:03.394 | INFO | __main__:main:356 - Starting LTX video generation server on port 8000 diff --git a/api/ltx_serve.py b/api/ltx_serve.py index 471d84d..fb320f8 100644 --- a/api/ltx_serve.py +++ b/api/ltx_serve.py @@ -20,7 +20,8 @@ from configs.ltx_settings import LTXVideoSettings from scripts.ltx_inference import LTXInference -from scripts import mp4_to_s3_json +from scripts.mp4_to_s3_json import mp4_to_s3_json +import torch # Set up prometheus multiprocess mode os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/prometheus_multiproc_dir" @@ -206,6 +207,20 @@ def predict(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: logger.info(f"Starting generation for prompt: {generation_request.prompt}") self.engine.generate() + # Verify file exists and is readable before uploading + if not temp_video_path.exists(): + raise FileNotFoundError(f"Generated video file not found at {temp_video_path}") + + if not os.access(temp_video_path, os.R_OK): + raise PermissionError(f"Generated video file is not readable at {temp_video_path}") + + # Upload to S3 with explicit file opening + with open(temp_video_path, 'rb') as video_file: + s3_response = mp4_to_s3_json( + video_file, + f"ltx_{int(time.time())}.mp4" + ) + end_time = time.time() generation_time = end_time - start_time self.log("inference_time", generation_time) @@ -216,12 +231,6 @@ def predict(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: "gpu_reserved": torch.cuda.memory_reserved() if torch.cuda.is_available() else 0 } - # Upload to S3 - s3_response = mp4_to_s3_json( - temp_video_path, - f"ltx_{int(time.time())}.mp4" - ) - result = { "status": "success", "video_id": s3_response["video_id"], @@ -236,10 +245,12 @@ def predict(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: logger.info(f"Generation completed successfully") except Exception as e: - logger.error(f"Error in generation: {e}") + import traceback + logger.error(f"Error in generation: {e}\n{traceback.format_exc()}") 
                 results.append({
                     "status": "error",
-                    "error": str(e)
+                    "error": str(e),
+                    "traceback": traceback.format_exc()
                 })
 
             finally:
@@ -248,10 +259,12 @@ def predict(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
                 torch.cuda.empty_cache()
 
         except Exception as e:
-            logger.error(f"Error in predict method: {e}")
+            import traceback
+            logger.error(f"Error in predict method: {e}\n{traceback.format_exc()}")
             results.append({
                 "status": "error",
-                "error": str(e)
+                "error": str(e),
+                "traceback": traceback.format_exc()
             })
 
         return results if results else [{"status": "error", "error": "No results generated"}]
diff --git a/configs/__pycache__/ltx_settings.cpython-311.pyc b/configs/__pycache__/ltx_settings.cpython-311.pyc
index f4b1e8f..edb7680 100644
Binary files a/configs/__pycache__/ltx_settings.cpython-311.pyc and b/configs/__pycache__/ltx_settings.cpython-311.pyc differ
diff --git a/configs/ltx_settings.py b/configs/ltx_settings.py
index 0231bdc..bbe0bd5 100644
--- a/configs/ltx_settings.py
+++ b/configs/ltx_settings.py
@@ -1,7 +1,33 @@
 """
 Configuration module for LTX video generation model with HuggingFace Hub integration.
+
+This module provides a comprehensive configuration system for the LTX video generation model,
+handling model downloads, parameter validation, and settings management. It uses Pydantic
+for robust configuration validation and type checking.
+
+Key Features:
+    - Automatic model download from HuggingFace Hub
+    - Configurable video generation parameters
+    - Input/output path management
+    - Model checkpoint verification
+    - Device and precision settings
+    - Prompt configuration for generation
+
+Example:
+    >>> settings = LTXVideoSettings(
+    ...     model_id="Lightricks/LTX-Video",
+    ...     prompt="A beautiful sunset over the ocean",
+    ...     num_frames=60
+    ... )
+    >>> settings.download_model()
+    >>> unet_path, vae_path, scheduler_path = settings.get_model_paths()
 """
 
 from typing import Optional, Union
 from pathlib import Path
 import os
+
+# Maximum generation dimensions supported by the model (module-level constants,
+# imported by callers such as scripts/ltx_inference.py)
+MAX_HEIGHT: int = 720
+MAX_WIDTH: int = 1280
+MAX_NUM_FRAMES: int = 257
@@ -14,6 +40,39 @@ class LTXVideoSettings(BaseSettings):
     """
     Configuration settings for LTX video generation model.
+
+    This class manages all configuration aspects of the LTX video generation pipeline,
+    including model paths, generation parameters, and output settings. It provides
+    validation and automatic type conversion for all settings.
+ + Attributes: + model_id (str): HuggingFace model identifier (default: "Lightricks/LTX-Video") + ckpt_dir (Path): Directory for model checkpoints + use_auth_token (Optional[str]): HuggingFace authentication token + input_video_path (Optional[Path]): Path to input video file + input_image_path (Optional[Path]): Path to input image file + output_path (Optional[Path]): Directory for output files + seed (int): Random seed for reproducible generation + num_inference_steps (int): Number of denoising steps (range: 1-100) + guidance_scale (float): Classifier-free guidance scale (range: 1.0-20.0) + height (int): Output video height in pixels (range: 256-720) + width (int): Output video width in pixels (range: 256-1280) + num_frames (int): Number of frames to generate (range: 1-257) + frame_rate (int): Output video frame rate (range: 1-60) + num_images_per_prompt (int): Number of videos per prompt (range: 1-4) + bfloat16 (bool): Whether to use bfloat16 precision + device (str): Device for inference ('cuda' or 'cpu') + prompt (Optional[str]): Generation prompt text + negative_prompt (str): Negative prompt for undesired features + + Example: + >>> settings = LTXVideoSettings( + ... prompt="A serene mountain landscape", + ... num_frames=60, + ... height=480, + ... width=704 + ... ) + >>> settings.download_model() """ # Model Settings @@ -33,10 +92,7 @@ class LTXVideoSettings(BaseSettings): input_video_path: Optional[Path] = Field(None, description="Path to input video file") input_image_path: Optional[Path] = Field(None, description="Path to input image file") - output_path: Optional[Path] = Field( - default_factory=lambda: Path("outputs"), - description="Path to save output files" - ) + output_path: Optional[Path] = Field(None, description="Path to save output files") # Generation Settings seed: int = Field(171198, description="Random seed for generation") @@ -44,16 +100,26 @@ class LTXVideoSettings(BaseSettings): guidance_scale: float = Field(3.0, ge=1.0, le=20.0, description="Guidance scale") # Video Parameters - height: int = Field(480, ge=256, le=720, description="Height of output video frames") - width: int = Field(704, ge=256, le=1280, description="Width of output video frames") - num_frames: int = Field(121, ge=1, le=257, description="Number of frames to generate") - frame_rate: int = Field(25, ge=1, le=60, description="Frame rate of output video") - num_images_per_prompt: int = Field( - 1, + height: int = Field( + 480, + ge=256, + le=MAX_HEIGHT, + description="Height of output video frames" + ) + width: int = Field( + 704, + ge=256, + le=MAX_WIDTH, + description="Width of output video frames" + ) + num_frames: int = Field( + 121, ge=1, - le=4, - description="Number of videos to generate per prompt" + le=MAX_NUM_FRAMES, + description="Number of frames to generate" ) + frame_rate: int = Field(25, ge=1, le=60, description="Frame rate of output video") + num_images_per_prompt: int = Field(1, ge=1, le=4, description="Number of videos per prompt") # Model Settings bfloat16: bool = Field(False, description="Use bfloat16 precision") @@ -75,8 +141,21 @@ def download_model(self) -> Path: """ Download model from HuggingFace Hub if not already present. + This method checks for existing model files, downloads missing components, + and verifies the integrity of the downloaded files. It handles authentication + for private models using the provided token. 
+ Returns: Path: Path to the model checkpoint directory + + Raises: + ValueError: If model download is incomplete or verification fails + Exception: If download encounters network or permission errors + + Example: + >>> settings = LTXVideoSettings() + >>> model_path = settings.download_model() + >>> print(f"Model downloaded to {model_path}") """ try: logger.info(f"Checking for model in {self.ckpt_dir}") @@ -112,10 +191,24 @@ def download_model(self) -> Path: def _verify_model_files(self) -> bool: """ - Verify that all required model files are present. + Verify that all required model files are present in the checkpoint directory. + Checks for the existence of essential model components including the UNet, + VAE, and scheduler configurations and weights. + Returns: - bool: True if all required files are present + bool: True if all required files are present and accessible + + Note: + Required directory structure: + - unet/ + - config.json + - unet_diffusion_pytorch_model.safetensors + - vae/ + - config.json + - vae_diffusion_pytorch_model.safetensors + - scheduler/ + - scheduler_config.json """ required_dirs = ['unet', 'vae', 'scheduler'] required_files = { @@ -156,7 +249,20 @@ class Config: validate_assignment = True def get_model_paths(self) -> tuple[Path, Path, Path]: - """Get paths to model components after ensuring model is downloaded.""" + """ + Get paths to model components after ensuring model is downloaded. + + This method ensures the model is downloaded before returning paths to + the essential model components. + + Returns: + tuple[Path, Path, Path]: Paths to (unet_dir, vae_dir, scheduler_dir) + + Example: + >>> settings = LTXVideoSettings() + >>> unet, vae, scheduler = settings.get_model_paths() + >>> print(f"UNet path: {unet}") + """ # Ensure model is downloaded self.download_model() @@ -167,13 +273,33 @@ def get_model_paths(self) -> tuple[Path, Path, Path]: return unet_dir, vae_dir, scheduler_dir def get_output_resolution(self) -> tuple[int, int]: - """Get the output resolution as a tuple of (height, width).""" + """ + Get the output resolution as a tuple of (height, width). + + Returns: + tuple[int, int]: Video dimensions as (height, width) + + Example: + >>> settings = LTXVideoSettings(height=480, width=704) + >>> h, w = settings.get_output_resolution() + >>> print(f"Output resolution: {h}x{w}") + """ return (self.height, self.width) def get_padded_num_frames(self) -> int: """ Calculate the padded number of frames. - Ensures the number of frames is compatible with model requirements. + + Ensures the number of frames is compatible with model requirements by + padding to the nearest multiple of 8 frames if necessary. 
+
+        Returns:
+            int: Padded frame count that's compatible with the model
+
+        Example:
+            >>> settings = LTXVideoSettings(num_frames=30)
+            >>> padded = settings.get_padded_num_frames()
+            >>> print(f"Padded frame count: {padded}")  # Will be 32
         """
         # Common video models often require frame counts to be multiples of 8
         FRAME_PADDING = 8
diff --git a/ltx b/ltx
new file mode 160000
index 0000000..e61ec9c
--- /dev/null
+++ b/ltx
@@ -0,0 +1 @@
+Subproject commit e61ec9ce9db99aadccc7675f580bd51052c78e0a
diff --git a/scripts/__pycache__/ltx_inference.cpython-311.pyc b/scripts/__pycache__/ltx_inference.cpython-311.pyc
index b0c2b93..2d3dbce 100644
Binary files a/scripts/__pycache__/ltx_inference.cpython-311.pyc and b/scripts/__pycache__/ltx_inference.cpython-311.pyc differ
diff --git a/scripts/__pycache__/mp4_to_s3_json.cpython-311.pyc b/scripts/__pycache__/mp4_to_s3_json.cpython-311.pyc
index 4c9cadd..08e545f 100644
Binary files a/scripts/__pycache__/mp4_to_s3_json.cpython-311.pyc and b/scripts/__pycache__/mp4_to_s3_json.cpython-311.pyc differ
diff --git a/scripts/__pycache__/s3_manager.cpython-311.pyc b/scripts/__pycache__/s3_manager.cpython-311.pyc
index 86b691a..849370a 100644
Binary files a/scripts/__pycache__/s3_manager.cpython-311.pyc and b/scripts/__pycache__/s3_manager.cpython-311.pyc differ
diff --git a/scripts/ltx_inference.py b/scripts/ltx_inference.py
index d591b37..3adb544 100644
--- a/scripts/ltx_inference.py
+++ b/scripts/ltx_inference.py
@@ -37,12 +37,7 @@ class LTXInference:
     """
 
     def __init__(self, config: LTXVideoSettings):
-        """
-        Initialize the inference pipeline with given configuration.
-
-        Args:
-            config: LTXVideoSettings configuration object
-        """
+        """Initialize with settings"""
         self.config = config
         self.setup_random_seeds()
         self.pipeline = self._initialize_pipeline()
@@ -86,7 +81,7 @@ def _load_unet(self, unet_dir: Path) -> Transformer3DModel:
         return transformer
 
     def _initialize_pipeline(self) -> LTXVideoPipeline:
-        """Initialize the complete LTX pipeline with all components"""
+        """Initialize pipeline with all components"""
         unet_dir, vae_dir, scheduler_dir = self.config.get_model_paths()
 
         # Load models
@@ -95,7 +90,7 @@ def _initialize_pipeline(self) -> LTXVideoPipeline:
         scheduler = self._load_scheduler(scheduler_dir)
         patchifier = SymmetricPatchifier(patch_size=1)
 
-        # Load text encoder and tokenizer
+        # Load text encoder and tokenizer from PixArt
         text_encoder = T5EncoderModel.from_pretrained(
             "PixArt-alpha/PixArt-XL-2-1024-MS",
             subfolder="text_encoder"
@@ -111,7 +106,7 @@ def _initialize_pipeline(self) -> LTXVideoPipeline:
         if self.config.bfloat16 and unet.dtype != torch.bfloat16:
             unet = unet.to(torch.bfloat16)
 
-        # Initialize pipeline with all components
+        # Initialize pipeline
         pipeline = LTXVideoPipeline(
             transformer=unet,
             patchifier=patchifier,
@@ -165,35 +160,41 @@ def load_input_image(self) -> Optional[torch.Tensor]:
         return frame_tensor.unsqueeze(0).unsqueeze(2)
 
     def generate(self) -> None:
-        """Run the main generation pipeline"""
+        """Run generation pipeline"""
         # Load input image if provided
         media_items_prepad = self.load_input_image()
 
-        # Calculate dimensions
-        height_padded, width_padded = self.config.get_output_resolution()
-        num_frames_padded = self.config.get_padded_num_frames()
+        # Read requested dimensions from the config
+        height = self.config.height
+        width = self.config.width
+        num_frames = self.config.num_frames
 
-        logger.info(f"Generating with dimensions: {height_padded}x{width_padded}x{num_frames_padded}")
+        # Validate against the module-level limits defined in configs.ltx_settings
+        from configs.ltx_settings import MAX_HEIGHT, MAX_WIDTH, MAX_NUM_FRAMES
+        if height > MAX_HEIGHT or width > MAX_WIDTH or num_frames > MAX_NUM_FRAMES:
+            logger.warning(
+                f"Requested dimensions {height}x{width}x{num_frames} exceed the supported maximum; "
+                f"a size at or below {MAX_HEIGHT}x{MAX_WIDTH}x{MAX_NUM_FRAMES} is recommended."
+            )
+
+        # Adjust dimensions to be divisible by 32 and num_frames to be (N * 8 + 1)
+        height_padded = ((height - 1) // 32 + 1) * 32
+        width_padded = ((width - 1) // 32 + 1) * 32
+        num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1
 
-        # Calculate padding
-        padding = self._calculate_padding(
-            self.config.height,
-            self.config.width,
-            height_padded,
-            width_padded
-        )
+        logger.info(f"Padded dimensions: {height_padded}x{width_padded}x{num_frames_padded}")
 
-        # Pad input media if present
+        # Calculate and apply padding
+        padding = self._calculate_padding(height, width, height_padded, width_padded)
         if media_items_prepad is not None:
             media_items = F.pad(media_items_prepad, padding, mode="constant", value=-1)
         else:
             media_items = None
 
-        # Prepare generation parameters
+        # Set up generator
         generator = torch.Generator(
             device="cuda" if torch.cuda.is_available() else "cpu"
         ).manual_seed(self.config.seed)
-        
+
         # Run pipeline
         images = self.pipeline(
             prompt=self.config.prompt,
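
Sanity check of the padding arithmetic introduced in generate() above: spatial dimensions round up to the nearest multiple of 32, and the frame count rounds up to the form N * 8 + 1. The sketch below is a standalone illustration, not part of the patch; the helper name pad_dims is invented for this example.

def pad_dims(height: int, width: int, num_frames: int) -> tuple[int, int, int]:
    """Mirror the padding rules used in LTXInference.generate()."""
    height_padded = ((height - 1) // 32 + 1) * 32            # round up to a multiple of 32
    width_padded = ((width - 1) // 32 + 1) * 32              # round up to a multiple of 32
    num_frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1  # round up to N * 8 + 1
    return height_padded, width_padded, num_frames_padded

# The defaults (480x704, 121 frames) already satisfy both constraints:
assert pad_dims(480, 704, 121) == (480, 704, 121)
# Off-grid requests are rounded up, e.g. 500x700 at 30 frames:
assert pad_dims(500, 700, 30) == (512, 704, 33)

Note that get_padded_num_frames() in configs/ltx_settings.py still pads to a plain multiple of 8, while generate() now enforces N * 8 + 1; the two rules disagree for most inputs (e.g. 30 -> 32 vs. 33).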