From 1ebe67bb6328e26ae9f907b44875904f7f910eb4 Mon Sep 17 00:00:00 2001
From: felix
Date: Thu, 24 Oct 2024 18:50:47 +0200
Subject: [PATCH] enabled uploads to the pCloud repository.

---
 .../lcdb/db/_pcloud_repository.py             | 143 +++++++++++++++++-
 1 file changed, 138 insertions(+), 5 deletions(-)

diff --git a/publications/2023-neurips/lcdb/db/_pcloud_repository.py b/publications/2023-neurips/lcdb/db/_pcloud_repository.py
index a20469b..8e8f7e9 100644
--- a/publications/2023-neurips/lcdb/db/_pcloud_repository.py
+++ b/publications/2023-neurips/lcdb/db/_pcloud_repository.py
@@ -16,15 +16,36 @@
 
 class PCloudRepository(Repository):
 
-    def __init__(self, repo_code):
+    def __init__(self, repo_code, token=None):
         super().__init__()
         self.repo_code = repo_code
-        response = requests.get(f"https://api.pcloud.com/showpublink?code={self.repo_code}")
-        self.content = response.json()
+        self.content = None
+        self.token = token
+
+        # update content
+        self.update_content()
+
+    def update_content(self):
+        """Fetches the current listing of the public link and caches the parsed JSON in self.content."""
+        self.content = requests.get(f"https://api.pcloud.com/showpublink?code={self.repo_code}").json()
 
     def exists(self):
         return self.content is not None and len(self.content) > 0
 
+    def authenticate(self, username, password):
+        """
+        Requests an auth token from pCloud with the given credentials and stores it in self.token.
+        Raises a ValueError if the response does not contain a token.
+        """
+        url = "https://api.pcloud.com/userinfo?getauth=1&logout=1&device=lcdbclient"
+        response = requests.post(url, {
+            "username": username,
+            "password": password
+        }).json()
+        self.token = response.get("auth")
+        if self.token is None:
+            raise ValueError(f"Authentication failed. Response from server was {response}.")
+
     def read_result_file(self, file, usecols=None):
 
         # get download link
@@ -50,8 +71,120 @@ def read_result_file(self, file, usecols=None):
         else:
             print(f"Failed to fetch the file. Status code: {response.status_code}")
 
+    def _get_folder_id(self, workflow=None, campaign=None, openmlid=None):
+        """
+        Returns the folderid of the folder containing the data for this context.
+        Returns None if that folder does not exist.
+        """
+
+        # create query
+        query = "metadata .contents[? name == 'data'] | [0]"
+        if workflow is not None:
+            query += f".contents | [? name == '{workflow}'] | [0]"
+            if campaign is not None:
+                query += f".contents | [? name == '{campaign}'] | [0]"
+                if openmlid is not None:
+                    query += f".contents | [? name == '{openmlid}'] | [0]"
+
+            elif openmlid is not None:
+                raise ValueError("openmlid can be only set if both a workflow and a campaign are given.")
+
+        elif campaign is not None or openmlid is not None:
+            raise ValueError("campaign or openmlid can be only set if a workflow is given.")
+
+        return jmespath.compile(f"{query}.folderid").search(self.content)
+
+    def _create_folder(self, parent_folder_id, name):
+        """
+        Creates the folder `name` inside the folder with id `parent_folder_id`, refreshes the cached
+        content, and returns the folderid reported by pCloud. Raises a ValueError on an error response.
+        """
+        response = requests.get(
+            "https://api.pcloud.com/createfolder",
+            params={"code": self.repo_code, "auth": self.token, "folderid": parent_folder_id, "name": name},
+        ).json()
+        if response is None:
+            raise ValueError(f"Could not create folder '{name}', received no response")
+        if "result" not in response or response["result"] != 0:
+            raise ValueError(f"Could not create folder '{name}', received invalid response: {response}")
+        self.update_content()
+        return response["metadata"]["folderid"]
+
     def add_results(self, campaign, *result_files):
-        raise NotImplementedError
+        """
+        Uploads the results contained in the given CSV files (plain or gzipped) for the given campaign.
+
+        Each file is split by workflow, openmlid and the three seeds; every such group is uploaded as
+        data/{workflow}/{campaign}/{openmlid}/{workflow_seed}-{test_seed}-{valid_seed}.csv.gz.
+        Folders that do not exist yet at pCloud are created on the way.
+        Raises a ValueError if pCloud answers with an error.
+        """
+        self.update_content()  # make sure that we have the current file structure at pCloud
+        for result_file in result_files:
+
+            # read result file
+            if result_file.endswith((".gz", ".gzip")):
+                with gzip.GzipFile(result_file, "rb") as f:
+                    df = pd.read_csv(f)
+            else:
+                df = pd.read_csv(result_file)
+
+            # decompose this dataframe so that we have results only for a single workflow/openmlid and seeds
+            for (workflow, openmlid, workflow_seed, valid_seed, test_seed), group in df.groupby(
+                ["m:workflow", "m:openmlid", "m:workflow_seed", "m:valid_seed", "m:test_seed"]
+            ):
+                name = f"{int(workflow_seed)}-{int(test_seed)}-{int(valid_seed)}.csv.gz"
+                print(f"Adding results for {workflow}/{campaign}/{openmlid}/{name}")
+                folder_id = self._get_folder_id(workflow=workflow, campaign=campaign, openmlid=openmlid)
+
+                # if the folder does not exist, create one
+                if folder_id is None:
+
+                    folder_id_root = self._get_folder_id()
+
+                    # create workflow folder if necessary
+                    folder_id_workflow = self._get_folder_id(workflow=workflow)
+                    if folder_id_workflow is None:
+                        print(f"create workflow folder {workflow} in folder id {folder_id_root}")
+                        folder_id_workflow = self._create_folder(parent_folder_id=folder_id_root, name=workflow)
+
+                    # create campaign folder if necessary
+                    folder_id_campaign = self._get_folder_id(workflow=workflow, campaign=campaign)
+                    if folder_id_campaign is None:
+                        print(f"create campaign folder inside {folder_id_workflow}")
+                        folder_id_campaign = self._create_folder(parent_folder_id=folder_id_workflow, name=campaign)
+
+                    # create dataset folder if necessary (re-check, the content was refreshed in between)
+                    folder_id = self._get_folder_id(workflow=workflow, campaign=campaign, openmlid=openmlid)
+                    if folder_id is None:
+                        print(f"create dataset folder inside {folder_id_campaign}")
+                        folder_id = self._create_folder(parent_folder_id=folder_id_campaign, name=openmlid)
+
+                # Create a BytesIO object to hold the CSV in binary format
+                csv_buffer = io.BytesIO()
+
+                # Serialize only the rows of this group; the file name refers to this group's seeds
+                csv_string = group.to_csv(index=False)
+
+                # Compress the CSV data using gzip
+                with gzip.GzipFile(fileobj=csv_buffer, mode='wb') as gz:
+                    gz.write(csv_string.encode('utf-8'))  # Compress the CSV string (convert it to bytes first)
+
+                # Reset the buffer's position to the beginning
+                csv_buffer.seek(0)
+
+                # upload the file
+                status = requests.post(
+                    "https://api.pcloud.com/uploadfile",
+                    params={"code": self.repo_code, "auth": self.token, "folderid": folder_id, "filename": name},
+                    files={'file': (name, csv_buffer, 'application/gzip')},
+                ).json()
+                if not isinstance(status, dict):
+                    raise ValueError(
+                        f"Could not add result. Object received from pCloud should be a dict but is {type(status)}"
+                    )
+                if status.get("result") != 0:
+                    raise ValueError(f"Could not add result. Received an error response from pCloud: {status}")
 
     def get_workflows(self):
         return jmespath.compile("metadata.contents[? name == 'data'] | [0] .contents | [*].name").search(self.content)
@@ -143,7 +276,7 @@ def get_result_files_of_workflow_in_campaign(
 
     def get_result_files_of_workflow(
         self,
-        workflow,
+        workflow=None,
         campaigns=None,
         openmlids=None,
         workflow_seeds=None,