Merge pull request #27 from monarch-initiative/s3

Local file to S3 upload.

hrshdhgd authored Feb 9, 2024
2 parents 2c7d1cb + fb474f9 commit 5b9ee7a

Showing 9 changed files with 901 additions and 392 deletions.
6 changes: 6 additions & 0 deletions docs/index.md
@@ -23,6 +23,7 @@ Available options are:
 - `http(s)`
 - Google Cloud Storage (`gs://`)
 - Google Drive (`gdrive://` or https://drive.google.com/...). The file must be publicly accessible.
+- Amazon AWS S3 bucket (`s3://`)
 - **local_name**: The name to save the file as locally
 - **tag**: A tag to use to filter downloads
 - **api**: The API to use to download the file. Currently supported: `elasticsearch`
@@ -36,6 +37,11 @@ Available options are:
 > - [add the service account to the relevant bucket](https://cloud.google.com/storage/docs/access-control/using-iam-permissions#bucket-iam) and
 > - [download a JSON key](https://cloud.google.com/iam/docs/keys-create-delete) for that service account.
 > Then, set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to point to that file.
+>
+> Mirroring local files to an Amazon AWS S3 bucket requires the following:
+> - [Create an AWS account](https://portal.aws.amazon.com/)
+> - [Create an IAM user in AWS](https://docs.aws.amazon.com/IAM/latest/UserGuide/getting-started.html): this provides the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` needed for authentication. These two should be stored as environment variables on the user's system.
+> - [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html): this will be the destination for pushing local files.
 
 You can also include any secrets like API keys you have set as environment variables using `{VARIABLE_NAME}`, for example:
 ```yaml
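
To make the credential setup above concrete, here is a minimal sketch of what the S3 mirror path expects at runtime; boto3 resolves `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` from the environment on its own, and the bucket and file names below are placeholders, not part of this PR:

```python
import os

import boto3

# boto3 reads these from the environment automatically; failing early gives
# a clearer error than boto3's NoCredentialsError at upload time.
for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if var not in os.environ:
        raise EnvironmentError(f"{var} must be set to mirror files to S3")

s3 = boto3.client("s3")
# upload_file(Filename, Bucket, Key); names here are for illustration only
s3.upload_file("local_file.txt", "my-example-bucket", "remote/local_file.txt")
```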
3 changes: 3 additions & 0 deletions example/download.yaml
@@ -12,6 +12,9 @@
 - url: https://drive.google.com/uc?id=10ojJffrPSl12OMcu4gyx0fak2CNu6qOs
   local_name: gdrive_test_2.txt
 
+- url: s3://monarch-kg-test/kghub_downloader_test_file.yaml
+  local_name: test_file.yaml
+
 # - url: https://www.ebi.ac.uk/chembl/elk/es/
 #   api: elasticsearch
 #   query_file: example/query.json
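
For completeness, a sketch of exercising this example spec from Python, assuming `download_from_yaml` accepts the YAML path and an output directory (the keyword names here are guesses; the full signature is not visible in this diff):

```python
from kghub_downloader.download_utils import download_from_yaml

# Fetches every entry in the spec, including the new s3:// URL,
# into the current directory.
download_from_yaml(yaml_file="example/download.yaml", output_dir=".")
```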
59 changes: 40 additions & 19 deletions kghub_downloader/download_utils.py
@@ -1,25 +1,26 @@
-import os, pathlib, re
-
-import json, yaml
-import compress_json  # type: ignore
-
-# from compress_json import compress_json
-
+import json
+import logging
+import os
+import pathlib
+import re
 from multiprocessing.sharedctypes import Value
-
+from typing import List, Optional
 from urllib.error import URLError
 from urllib.request import Request, urlopen
 
-
+import boto3
+import compress_json  # type: ignore
 import elasticsearch
 import elasticsearch.helpers
-
-from tqdm.auto import tqdm  # type: ignore
+import gdown
+import yaml
+from botocore.exceptions import NoCredentialsError
 from google.cloud import storage
 from google.cloud.storage.blob import Blob
-from typing import List, Optional
-import gdown
+from tqdm.auto import tqdm  # type: ignore
 
+# from compress_json import compress_json
+
 
 GDOWN_MAP = {"gdrive": "https://drive.google.com/uc?id="}
 
@@ -104,6 +105,11 @@ def download_from_yaml(
             Blob.from_string(url, client=storage.Client()).download_to_filename(
                 outfile
             )
+        elif url.startswith("s3://"):
+            s3 = boto3.client("s3")
+            bucket_name = url.split("/")[2]
+            remote_file = "/".join(url.split("/")[3:])
+            s3.download_file(bucket_name, remote_file, outfile)
         elif any(
             url.startswith(str(i))
             for i in list(GDOWN_MAP.keys()) + list(GDOWN_MAP.values())
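
The new `s3://` branch above treats the first path segment as the bucket and everything after it as the object key. Traced by hand with the test URL from example/download.yaml:

```python
url = "s3://monarch-kg-test/kghub_downloader_test_file.yaml"

parts = url.split("/")
# parts == ['s3:', '', 'monarch-kg-test', 'kghub_downloader_test_file.yaml']
bucket_name = parts[2]               # 'monarch-kg-test'
remote_file = "/".join(parts[3:])    # 'kghub_downloader_test_file.yaml'
```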
Expand Down Expand Up @@ -157,6 +163,8 @@ def download_from_yaml(


def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
bucket_split = bucket_url.split("/")
bucket_name = bucket_split[2]
with open(local_file, "rb"):
if bucket_url.startswith("gs://"):

@@ -165,8 +173,6 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
 
             # Connect to GCS Bucket
             storage_client = storage.Client()
-            bucket_split = bucket_url.split("/")
-            bucket_name = bucket_split[2]
             bucket = storage_client.bucket(bucket_name)
 
             # Upload blob from local file
@@ -188,12 +194,27 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
             blob.upload_from_filename(local_file)
 
         elif bucket_url.startswith("s3://"):
-            raise ValueError("Currently, only Google Cloud storage is supported.")
-            # bashCommand = f"aws s3 cp {outfile} {mirror}"
-            # subprocess.run(bashCommand.split())
+            # Create an S3 client
+            s3 = boto3.client("s3")
+
+            try:
+                # Upload the file
+                # ! This will only work if the user has the AWS IAM user
+                # ! access keys set up as environment variables.
+                s3.upload_file(local_file, bucket_name, remote_file)
+                print(f"File {local_file} uploaded to {bucket_name}/{remote_file}")
+                return True
+            except FileNotFoundError:
+                print(f"The file {local_file} was not found")
+                return False
+            except NoCredentialsError:
+                print("Credentials not available")
+                return False
 
         else:
-            raise ValueError("Currently, only Google Cloud storage is supported.")
+            raise ValueError(
+                "Currently, only Google Cloud and S3 storage is supported."
+            )
 
     return None
 
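
A sketch of calling the new upload path directly, assuming the AWS credentials described in docs/index.md are set; the bucket name is a placeholder:

```python
from kghub_downloader.download_utils import mirror_to_bucket

mirror_to_bucket(
    local_file="test_file.yaml",
    bucket_url="s3://my-example-bucket",
    remote_file="test_file.yaml",
)
```

Note that the S3 branch returns True or False while the function is annotated `-> None` (the GCS branch still falls through to `return None`), so callers should not rely on the return value.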
3 changes: 2 additions & 1 deletion kghub_downloader/main.py
@@ -1,6 +1,7 @@
-from typing import Optional, List
+from typing import List, Optional
+
 import typer
 
 from kghub_downloader.download_utils import download_from_yaml
 
 typer_app = typer.Typer()