diff --git a/.gitignore b/.gitignore index 7d8c902..aafd43e 100644 --- a/.gitignore +++ b/.gitignore @@ -95,3 +95,4 @@ venv.bak/ **/dev-dist manager/db.sqlite3-journal manager/schema.yml +staging \ No newline at end of file diff --git a/justfile b/justfile index 31a96f6..3b107e6 100644 --- a/justfile +++ b/justfile @@ -28,12 +28,14 @@ manager: --name amuman-manager-staging \ --network amuman-staging \ -v ./staging:/manager \ + -v ./mock_nas:/mnt/smb \ -e SECRET_KEY=$SECRET_KEY \ -e DJANGO_SUPERUSER_EMAIL=$DJANGO_SUPERUSER_EMAIL \ -e DJANGO_SUPERUSER_USERNAME=$DJANGO_SUPERUSER_USERNAME \ -e DJANGO_SUPERUSER_PASSWORD=$DJANGO_SUPERUSER_PASSWORD \ -e DOMAIN=$DOMAIN \ -e REDIS_HOST=amuman-redis-staging \ + -e SHARED_FOLDER=/mnt/smb \ amuman-manager-staging node: @@ -42,9 +44,11 @@ node: podman run --rm -it --replace --tz local --pull newer \ --name amuman-node-staging \ --device=nvidia.com/gpu=all \ - -v ./mock_nas:/mnt/smb \ + -v ./mock_nas:/shared \ + -v ./staging/node_config:/config \ -e MANAGER_DOMAIN=$DOMAIN \ -e NODE_NAME=staging-node-1 \ + -e SHARED_FOLDER=/shared \ amuman-node-staging staging: frontend redis manager proxy diff --git a/manager/amuman/settings.py b/manager/amuman/settings.py index 44e4a3b..714c11c 100644 --- a/manager/amuman/settings.py +++ b/manager/amuman/settings.py @@ -117,9 +117,9 @@ "manager.middleware.scheduler_middleware.SchedulerMiddleware", ] if DEBUG: - MIDDLEWARE.append( - "manager.middleware.generate_initial_data.GenerateRandomJobsMiddleware" - ) + MIDDLEWARE.append("manager.middleware.generate_initial_data_debug.Generate") +else: + MIDDLEWARE.append("manager.middleware.generate_initial_data_prod.Generate") ROOT_URLCONF = "amuman.urls" diff --git a/manager/manager/components/check_mx3_file.py b/manager/manager/components/check_mx3_file.py index c1dda6b..118fc9f 100644 --- a/manager/manager/components/check_mx3_file.py +++ b/manager/manager/components/check_mx3_file.py @@ -1,11 +1,15 @@ import logging +import os from pathlib import Path log = logging.getLogger("rich") +SHARED_FOLDER = Path(os.environ.get("SHARED_FOLDER", "/mnt/smb")) + def validate_mx3_file(path_str: str) -> bool: - path = Path(path_str) + path = SHARED_FOLDER / Path(path_str) + if not path.exists(): log.error(f"File does not exist: {path}") return False diff --git a/manager/manager/middleware/generate_initial_data.py b/manager/manager/middleware/generate_initial_data_debug.py similarity index 94% rename from manager/manager/middleware/generate_initial_data.py rename to manager/manager/middleware/generate_initial_data_debug.py index 65553cb..412adba 100644 --- a/manager/manager/middleware/generate_initial_data.py +++ b/manager/manager/middleware/generate_initial_data_debug.py @@ -11,14 +11,12 @@ log = logging.getLogger("rich") -class GenerateRandomJobsMiddleware: +class Generate: def __init__(self, get_response): self.get_response = get_response self.generate_users() self.generate_random_jobs() - raise MiddlewareNotUsed( - "GenerateRandomJobsMiddleware is disabled after initial use." - ) + raise MiddlewareNotUsed("Generate is disabled after initial use.") def generate_users(self): if not CustomUser.objects.exists(): diff --git a/manager/manager/middleware/generate_initial_data_prod.py b/manager/manager/middleware/generate_initial_data_prod.py new file mode 100644 index 0000000..b33b645 --- /dev/null +++ b/manager/manager/middleware/generate_initial_data_prod.py @@ -0,0 +1,25 @@ +import logging + +from django.contrib.auth.models import User as AuthUser +from django.core.exceptions import MiddlewareNotUsed + +from manager.models import CustomUser + +log = logging.getLogger("rich") + + +class Generate: + def __init__(self, get_response): + self.get_response = get_response + self.generate_users() + raise MiddlewareNotUsed("Generate is disabled after initial use.") + + def generate_users(self): + if not CustomUser.objects.exists(): + admin = AuthUser.objects.get(username="admin") + self.admin_user = CustomUser(auth=admin, concurrent_jobs=20) + self.admin_user.save() + + def __call__(self, request): + response = self.get_response(request) + return response diff --git a/manager/manager/serializers.py b/manager/manager/serializers.py index 3eac553..f9dd1e5 100644 --- a/manager/manager/serializers.py +++ b/manager/manager/serializers.py @@ -29,7 +29,7 @@ def create(self, validated_data): "username": validated_data.pop("username"), "password": validated_data.pop("password"), "email": validated_data.pop("email"), - "is_active": False, + "is_active": True, } # Check if username already exists if User.objects.filter(username=user_data["username"]).exists(): diff --git a/node/Dockerfile b/node/Dockerfile index 9b5fd1c..8c06fa3 100644 --- a/node/Dockerfile +++ b/node/Dockerfile @@ -22,4 +22,4 @@ COPY . . RUN pip install . ENV SMB_MOUNT_POINT=/mnt/smb -ENTRYPOINT ["/app/entrypoint.sh"] \ No newline at end of file +CMD amuman-node \ No newline at end of file diff --git a/node/amuman_node/api.py b/node/amuman_node/api.py index 6105cee..2e44454 100644 --- a/node/amuman_node/api.py +++ b/node/amuman_node/api.py @@ -1,9 +1,9 @@ import logging -import os from typing import Any, Dict import requests +from amuman_node.config import Config from amuman_node.job import Job log = logging.getLogger("rich") @@ -12,23 +12,50 @@ class API: - def __init__(self): - self.url = f"{os.environ['MANAGER_URL']}/api" - self.node_user: str = os.environ["NODE_NAME"] - self.node_password: str = os.environ["NODE_PASSWORD"] + def __init__(self, config: Config): + self.config = config + self.url = f"https://{config.manager_domain}/api" log.debug(f"API URL: {self.url}") self.access_token = None self.refresh_token = None self.headers = {} + def create_user_if_doesnt_exist(self) -> None: + log.debug("Checking if node user exists...") + users = self.get_users().json() # ["results"] + log.debug(f"Users: {users}") + node_user_exists = any( + user["auth"]["username"] == self.config.name for user in users + ) + if not node_user_exists: + log.debug("Node user does not exist. Creating...") + self.post_user() + else: + log.debug("Node user exists.") + def authenticate(self) -> bool: + # self.create_user_if_doesnt_exist() + try: + res = self.post_user() + if res.status_code == 201: + log.debug("Node user created successfully") + else: + log.debug( + f"Node user creation failed: {res.status_code=}, {res.json()=}" + ) + except requests.exceptions.RequestException as e: + log.exception(f"Error creating the node user: {e}") + + log.debug("Authenticating...") try: + data = { + "username": self.config.name, + "password": self.config.password, + } + log.debug(f"Authenticating with {self.url}/token/ {data=}") response = requests.post( self.url + "/token/", - json={ - "username": self.node_user, - "password": self.node_password, - }, + json=data, ) log.debug( f"Authentication response: {response.status_code=}, {response.json()=}" @@ -45,7 +72,27 @@ def authenticate(self) -> bool: log.error("Unable to authenticate with the manager") return False - def register(self, data: Data) -> requests.Response: + def get_users(self) -> requests.Response: + res = requests.get( + self.url + "/users/", + headers=self.headers, + ) + return res + + def post_user(self) -> requests.Response: + data = { + "username": self.config.name, + "password": self.config.password, + "email": f"{self.config.name}@localhost", + } + res = requests.post( + self.url + "/users/", + headers=self.headers, + json=data, + ) + return res + + def post_node(self, data: Data) -> requests.Response: res = requests.post( self.url + "/nodes/", headers=self.headers, @@ -61,7 +108,7 @@ def post_gpu(self, data: Data) -> requests.Response: ) return res - def update_job(self, job: Job) -> requests.Response: + def put_job(self, job: Job) -> requests.Response: res = requests.put( self.url + f"/jobs/{job.id}/", headers=self.headers, diff --git a/node/amuman_node/config.py b/node/amuman_node/config.py new file mode 100644 index 0000000..95ddf33 --- /dev/null +++ b/node/amuman_node/config.py @@ -0,0 +1,46 @@ +import json +import logging +import os +import random +from pathlib import Path + +log = logging.getLogger("rich") + + +class Config: + def __init__(self): + self.name: str + self.password: str + self.manager_domain: str + self.read_config() + + def read_config(self): + path = Path("/config/config.json") + if path.exists(): + with open(path) as f: + data = json.load(f) + self.name = data.get("name") + self.password = data.get("password") + self.manager_domain = data.get("manager_domain") + log.debug( + f"Config read from file: {self.name=}, {self.password=}, {self.manager_domain=}" + ) + + self.name = os.getenv("NODE_NAME", os.getenv("HOST", str(int(1e12)))) + if self.password is None: + self.password = str(random.randint(0, int(1e12))) + self.manager_domain: str = os.getenv("MANAGER_DOMAIN", "localhost") + self.write_config() + log.debug(f"Config: {self.name=}, {self.password=}, {self.manager_domain=}") + + def write_config(self): + path = Path("/config/config.json") + config = { + "name": self.name, + "password": self.password, + "manager_domain": self.manager_domain, + } + # create the directory if it doesn't exist + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + json.dump(config, f, indent=4) diff --git a/node/amuman_node/job_manager.py b/node/amuman_node/job_manager.py index 3cbf820..cec84fc 100644 --- a/node/amuman_node/job_manager.py +++ b/node/amuman_node/job_manager.py @@ -1,6 +1,8 @@ import asyncio import logging +import os from datetime import datetime +from pathlib import Path from amuman_node.api import API @@ -8,13 +10,16 @@ log = logging.getLogger("rich") +SHARED_FOLDER = Path(os.environ.get("SHARED_FOLDER", "/mnt/smb")) + class JobRunner: - def __init__(self, node_id: int, api: API, job_id: int) -> None: + def __init__(self, node_id: int, api: API, job_id: int, gpu_device_id: int) -> None: self.node_id: int = node_id self.api: API = api self.subprocess: asyncio.subprocess.Process self.job: Job = self.api.get_job(job_id) + self.gpu_device_id = gpu_device_id self.async_task = asyncio.create_task(self.run_subprocess()) # TODO: Check if interrupted @@ -22,7 +27,12 @@ def __init__(self, node_id: int, api: API, job_id: int) -> None: # job.status = "Finished" async def run_subprocess(self) -> None: - cmd: list[str] = ["amumax", "-gpu=1", "-magnets=false", self.job.path] + cmd: list[str] = [ + "amumax", + f"-gpu={self.gpu_device_id}", + "-magnets=false", + str(SHARED_FOLDER / Path(self.job.path)), + ] log.debug(f"Starting subprocess for job ID: {self.job.id} with command: {cmd}") try: @@ -62,14 +72,16 @@ async def _handle_completion(self) -> None: log.debug(f"AMUmax exited with status {self.job.status.name}.") self.job.end_time = datetime.now().isoformat() try: - res = self.api.update_job(self.job) + res = self.api.put_job(self.job) + if res.status_code not in [200, 201]: + log.error( + f"Failed to update job ID: {self.job.id}. Status Code: {res.status_code}. Response: {res.json()}" + ) + return except Exception as e: log.error(f"Failed to update job ID: {self.job.id}. Error: {e}") return - log.debug( - f"Job ID: {self.job.id}(completed) updated with status: {self.job.status.name}." - ) - log.debug(f"Response: {res.json()}") + log.debug(f"Job ID: {self.job.id} updated with status: {self.job.status.name}.") async def _handle_error(self, error: Exception) -> None: error_message: str = "" @@ -87,7 +99,7 @@ async def _handle_error(self, error: Exception) -> None: log.error(error_message) self.job.status = JobStatus.INTERRUPTED self.job.error = error_message - res = self.api.update_job(self.job) + res = self.api.put_job(self.job) log.debug( f"Job ID: {self.job.id}(error) updated with status: {self.job.status.name}. Response: {res.json()}" ) @@ -97,4 +109,4 @@ async def stop_process(self) -> None: log.debug(f"Stopping amumax for job ID: {self.job.id}") self.subprocess.terminate() self.job.status = JobStatus.INTERRUPTED - self.api.update_job(self.job) + self.api.put_job(self.job) diff --git a/node/amuman_node/main.py b/node/amuman_node/main.py index 82d077d..72832f3 100644 --- a/node/amuman_node/main.py +++ b/node/amuman_node/main.py @@ -1,23 +1,16 @@ import asyncio import logging import os -import socket -import uuid -from typing import Any, Optional, Union +from typing import Any, Optional import requests import websockets from rich.logging import RichHandler -from websockets.exceptions import ( - ConnectionClosed, - ConnectionClosedError, - ConnectionClosedOK, -) from amuman_node.api import API +from amuman_node.config import Config from amuman_node.gpu_monitor import GPUMonitor -from amuman_node.job_manager import JobRunner -from amuman_node.websockets import Websockets, parse_message +from amuman_node.websockets import Websockets LOGLEVEL = os.environ.get("LOGLEVEL", "DEBUG").upper() @@ -33,31 +26,26 @@ logging.getLogger("websockets").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) +logging.getLogger("asyncio").setLevel(logging.WARNING) class NodeClient: def __init__(self) -> None: - self.api: API = API() - self.node_id: int = int(os.getenv("NODE_ID", 0)) - self.node_name: str = os.getenv("NODE_NAME", str(uuid.uuid1())) - self.node_user: str = os.getenv("NODE_USER", "admin") - self.node_password: str = os.getenv("NODE_PASSWORD", "admin") - log.debug(f"Node ID: {self.node_id}, Node Name: '{self.node_name}'") - self.ws = Websockets(self.api, self.node_id, self.node_name) - - self.reconnect_attempts: int = 10 - self.reconnect_delay: int = 10 + self.config: Config = Config() + self.api: API = API(self.config) self.gpm: Optional[GPUMonitor] = None - self.access_token: str - self.refresh_token: Optional[str] = None - self.is_registered = False - self.is_connected = False + self.ws: Websockets - self.reply_timeout = 10 - self.ping_timeout = 5 - self.sleep_time = 5 + self.run() - self.register_with_manager() + def run(self) -> None: + if self.register_with_manager(): + self.ws = Websockets(self.api, self.node_id, self.config.name, self.gpm) + try: + asyncio.run(self.ws.websocket_loop()) + except KeyboardInterrupt: + log.warning("Caught KeyboardInterrupt (Ctrl+C). Shutting down...") + # self.ws.close() def get_own_ip(self) -> str: try: @@ -68,22 +56,27 @@ def get_own_ip(self) -> str: log.exception(f"Unable to get the external IP: {err}") return "error" + def authenticate(self) -> bool: + if not self.api.authenticate(): + log.error("Authentication failed") + return False + return True + def register_with_manager(self) -> bool: if not self.api.authenticate(): return False data: dict[str, Any] = { - "name": self.node_name, + "name": self.config.name, "ip": self.get_own_ip(), - "number_of_gpus": 0, + "number_of_gpus": 0, # ??? } log.debug(f"Registering data: {data=}") try: - response = self.api.register(data) + response = self.api.post_node(data) if response.status_code in [200, 201]: self.node_id = int(response.json().get("id")) log.debug(f"Node registered: {self.node_id=}") - self.is_registered = True self.gpm = GPUMonitor(self.node_id, self.api) if response.status_code == 200: self.gpm.api_post("update") @@ -91,11 +84,9 @@ def register_with_manager(self) -> bool: self.gpm.api_post("assign") return True else: - self.is_registered = False log.error( - f"Failed to register node. Status Code: {response.status_code}" + f"Failed to register node. Status Code: {response.status_code} {response.text}" ) - log.debug(response.text) except requests.exceptions.ConnectionError: log.error(f"Couldn't connect to the manager: {self.api.url}") @@ -103,105 +94,9 @@ def register_with_manager(self) -> bool: log.exception(f"Error registering the node: {e}") return False - async def websocket_loop(self) -> None: - while True: - log.debug("WEBSOCKET: starting connection loop...") - try: - async with websockets.connect( - self.ws.url, extra_headers=self.api.headers - ) as ws: - while True: - try: - await self.ws.register(ws) - await self.handle_connection(ws) - except ( - asyncio.TimeoutError, - ConnectionClosed, - ConnectionClosedError, - ConnectionClosedOK, - ): - self.is_connected = False - self.is_registered = False - try: - pong = await ws.ping() - await asyncio.wait_for(pong, timeout=self.ping_timeout) - log.debug("Ping OK, keeping connection alive...") - continue - except Exception: - log.debug( - f"WEBSOCKET: Lost connection, retrying in {self.sleep_time}s" - ) - await asyncio.sleep(self.sleep_time) - break - except socket.gaierror: - self.is_connected = False - self.is_registered = False - log.debug( - f"Socket error - retrying connection in {self.sleep_time} sec (Ctrl-C to quit)" - ) - await asyncio.sleep(self.sleep_time) - continue - except ConnectionRefusedError: - self.is_connected = False - self.is_registered = False - log.debug( - "Nobody seems to listen to this endpoint. Please check the URL." - ) - log.debug( - f"Retrying connection in {self.sleep_time} sec (Ctrl-C to quit)" - ) - await asyncio.sleep(self.sleep_time) - continue - - async def handle_connection( - self, websocket: websockets.WebSocketClientProtocol - ) -> None: - while True: - try: - message: Union[str, bytes] = await websocket.recv() - if isinstance(message, bytes): - log.error("Received bytes instead of plain text from websocket") - else: - log.debug(f"Received message: {message}") - await self.process_message(message) - - except websockets.ConnectionClosed: - log.warning("Connection to the WebSocket server closed.") - break - - async def process_message(self, message: str | bytearray | memoryview) -> None: - if isinstance(message, str): - msg = parse_message(message) - if msg is None: - return - if msg.node_id != self.node_id: - log.debug("Command not for this node.") - return - if msg.command == "update_gpus": - log.info("Updating GPUs") - await self.execute_update_gpus() - elif msg.command == "run_job": - if msg.job_id is None: - log.error("No job_id in message") - return - log.info("Running job") - JobRunner(self.node_id, self.api, msg.job_id) - else: - log.error(f"Unknown command: {msg.command}") - - async def execute_update_gpus(self) -> None: - if self.gpm and len(self.gpm.gpus) > 0: - for gpu in self.gpm.gpus: - log.debug(f"Updating GPU: {gpu.device_id}") - gpu.update_status() - self.gpm.api_post("update") - def entrypoint() -> None: - try: - asyncio.run(NodeClient().websocket_loop()) - except KeyboardInterrupt: - log.warning("Caught KeyboardInterrupt (Ctrl+C). Shutting down...") + NodeClient() if __name__ == "__main__": diff --git a/node/amuman_node/websockets.py b/node/amuman_node/websockets.py index 2e9d04c..c3da344 100644 --- a/node/amuman_node/websockets.py +++ b/node/amuman_node/websockets.py @@ -1,11 +1,15 @@ +import asyncio import json import logging +import socket from typing import Optional, Union import websockets from pydantic import BaseModel from amuman_node.api import API +from amuman_node.gpu_monitor import GPUMonitor +from amuman_node.job_manager import JobRunner log = logging.getLogger("rich") @@ -27,11 +31,40 @@ def parse_message(message: str) -> Union[WebsocketMessage, None]: class Websockets: - def __init__(self, api: API, node_id: int, node_name: str) -> None: + def __init__(self, api: API, node_id: int, node_name: str, gpm) -> None: self.api: API = api self.node_name: str = node_name self.node_id: int = node_id + self.ping_timeout = 10 + self.sleep_time = 5 + self.gpm: GPUMonitor = gpm self.url = f"{self.api.url.replace('http','ws').replace('/api','')}/ws/node/?node_id={self.node_id}" + log.debug(f"Websocket URL: {self.url}") + self.ws: websockets.WebSocketClientProtocol + + async def websocket_loop(self): + while True: + log.debug("WEBSOCKET: starting connection loop...") + try: + async with websockets.connect( + self.url, extra_headers=self.api.headers + ) as ws: + self.ws = ws + while True: + if not await self.handle_connection_errors(ws): + break + await self.handle_connection(ws) + except (socket.gaierror, ConnectionRefusedError) as e: + error_msg = ( + "Socket error - retrying connection" + if isinstance(e, socket.gaierror) + else "Nobody seems to listen to this endpoint. Please check the URL." + ) + log.debug(f"{error_msg} in {self.sleep_time} sec (Ctrl-C to quit)") + await asyncio.sleep(self.sleep_time) + except websockets.exceptions.InvalidStatusCode as e: + log.error(f"Invalid status code: {e}") + await asyncio.sleep(self.sleep_time) async def register(self, ws: websockets.WebSocketClientProtocol) -> None: log.info("WEBSOCKET: Registering with the manager...") @@ -39,3 +72,69 @@ async def register(self, ws: websockets.WebSocketClientProtocol) -> None: WebsocketMessage(command="register", node_id=self.node_id).model_dump_json() ) log.info("WEBSOCKET: Connection started.") + + async def send_ping(self, ws): + pong = await ws.ping() + await asyncio.wait_for(pong, timeout=self.ping_timeout) + log.debug("Ping OK, keeping connection alive...") + + async def handle_connection( + self, websocket: websockets.WebSocketClientProtocol + ) -> None: + while True: + try: + message: Union[str, bytes] = await websocket.recv() + if isinstance(message, bytes): + log.error("Received bytes instead of plain text from websocket") + else: + log.debug(f"Received message: {message}") + await self.process_message(message) + + except websockets.ConnectionClosed: + log.warning("Connection to the WebSocket server closed.") + break + + async def process_message(self, message: str | bytearray | memoryview) -> None: + if isinstance(message, str): + msg = parse_message(message) + if msg is None: + return + if msg.node_id != self.node_id: + log.debug(f"Command not for this node. {msg.node_id=} {self.node_id=}") + return + if msg.command == "update_gpus": + log.info("Updating GPUs") + await self.execute_update_gpus() + elif msg.command == "run_job": + if msg.job_id is None: + log.error("No job_id in message") + return + if msg.gpu_device_id is None: + log.error("No gpu_device_id in message") + return + log.info("Running job") + JobRunner(self.node_id, self.api, msg.job_id, msg.gpu_device_id) + else: + log.error(f"Unknown command: {msg.command}") + else: + log.error("Received message is not a string") + + async def execute_update_gpus(self) -> None: + if self.gpm and len(self.gpm.gpus) > 0: + for gpu in self.gpm.gpus: + log.debug(f"Updating GPU: {gpu.device_id}") + gpu.update_status() + self.gpm.api_post("update") + + async def handle_connection_errors(self, ws): + try: + await self.send_ping(ws) + except Exception: + log.debug(f"WEBSOCKET: Lost connection, retrying in {self.sleep_time}s") + await asyncio.sleep(self.sleep_time) + return False + return True + + async def close(self): + log.info("Closing connection...") + await self.ws.close() diff --git a/node/amuman_node/ws_messages.py b/node/amuman_node/ws_messages.py deleted file mode 120000 index 52a1099..0000000 --- a/node/amuman_node/ws_messages.py +++ /dev/null @@ -1 +0,0 @@ -../../manager/manager/components/ws_messages.py \ No newline at end of file diff --git a/node/entrypoint.sh b/node/entrypoint.sh deleted file mode 100755 index b649f9c..0000000 --- a/node/entrypoint.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -set -e - -if [ -z "$MANAGER_URL" ]; then - echo "MANAGER_URL is not set" - exit 1 -fi -if [ -z "$NODE_NAME" ]; then - echo "NODE_NAME is not set" - exit 1 -fi -if [ -z "$NODE_PASSWORD" ]; then - echo "NODE_PASSWORD is not set" - exit 1 -fi - -echo "Starting server" -amuman-node \ No newline at end of file diff --git a/node/install_script.ps1 b/node/install_script.ps1 new file mode 100644 index 0000000..5536bfd --- /dev/null +++ b/node/install_script.ps1 @@ -0,0 +1,6 @@ +winget install podman +$env:Path = [System.Environment]::GetEnvironmentVariable("Path", "Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path", "User") +podman machine init +podman machine start +podman machine ssh "curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo && sudo yum install -y nvidia-container-toolkit && sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml" +podman run --rm -it --replace --tz local --pull newer --name amuman-node-staging --device=nvidia.com/gpu=all -e MANAGER_DOMAIN=amuman-staging.nyx.zfns.eu.org -e NODE_NAME=staging-node-1 -e SHARED_FOLDER=/shared ghcr.io/kkingstoun/amuman/node:0.0.7 \ No newline at end of file diff --git a/proxy/nginx.conf b/proxy/nginx.conf index 966ad95..48ea0e4 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -4,15 +4,29 @@ server { listen 80; server_name localhost; + resolver 10.89.10.1; + set $backend "http://amuman-manager-staging:8000"; location /api/ { - proxy_pass http://amuman-manager-staging:8000; + proxy_pass $backend; } location /admin/ { - proxy_pass http://amuman-manager-staging:8000; + proxy_pass $backend; } location /static/ { - proxy_pass http://amuman-manager-staging:8000; + proxy_pass $backend; + } + location /ws/node/ { + proxy_pass $backend; + + # WebSocket specific + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; } location / {