Merge pull request #27 from monarch-initiative/s3

Local file to S3 upload.

hrshdhgd authored Feb 9, 2024
2 parents 2c7d1cb + fb474f9 commit 5b9ee7a

Showing 9 changed files with 901 additions and 392 deletions.
6 changes: 6 additions & 0 deletions docs/index.md
@@ -23,6 +23,7 @@ Available options are:
 - `http(s)`
 - Google Cloud Storage (`gs://`)
 - Google Drive (`gdrive://` or https://drive.google.com/...). The file must be publicly accessible.
+- Amazon AWS S3 bucket (`s3://`)
 - **local_name**: The name to save the file as locally
 - **tag**: A tag to use to filter downloads
 - **api**: The API to use to download the file. Currently supported: `elasticsearch`
@@ -36,6 +37,11 @@ Available options are:
 > - [add the service account to the relevant bucket](https://cloud.google.com/storage/docs/access-control/using-iam-permissions#bucket-iam) and
 > - [download a JSON key](https://cloud.google.com/iam/docs/keys-create-delete) for that service account.
 > Then, set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to point to that file.
+>
+> Mirroring local files to an Amazon AWS S3 bucket requires the following:
+> - [Create an AWS account](https://portal.aws.amazon.com/)
+> - [Create an IAM user in AWS](https://docs.aws.amazon.com/IAM/latest/UserGuide/getting-started.html): this provides the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` needed for authentication. These two should be stored as environment variables on the user's system.
+> - [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html): this will be the destination for pushing local files.
 
 You can also include any secrets like API keys you have set as environment variables using `{VARIABLE_NAME}`, for example:
 ```yaml
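
To make the credential setup above concrete, here is a minimal sketch of what the S3 mirror path expects at runtime; boto3 resolves `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` from the environment on its own, and the bucket and file names below are placeholders, not part of this PR:

```python
import os

import boto3

# boto3 reads these from the environment automatically; failing early gives
# a clearer error than boto3's NoCredentialsError at upload time.
for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if var not in os.environ:
        raise EnvironmentError(f"{var} must be set to mirror files to S3")

s3 = boto3.client("s3")
# upload_file(Filename, Bucket, Key); names here are for illustration only
s3.upload_file("local_file.txt", "my-example-bucket", "remote/local_file.txt")
```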
3 changes: 3 additions & 0 deletions example/download.yaml
@@ -12,6 +12,9 @@
 - url: https://drive.google.com/uc?id=10ojJffrPSl12OMcu4gyx0fak2CNu6qOs
   local_name: gdrive_test_2.txt
 
+- url: s3://monarch-kg-test/kghub_downloader_test_file.yaml
+  local_name: test_file.yaml
+
 # - url: https://www.ebi.ac.uk/chembl/elk/es/
 #   api: elasticsearch
 #   query_file: example/query.json
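
For completeness, a sketch of exercising this example spec from Python, assuming `download_from_yaml` accepts the YAML path and an output directory (the keyword names here are guesses; the full signature is not visible in this diff):

```python
from kghub_downloader.download_utils import download_from_yaml

# Fetches every entry in the spec, including the new s3:// URL,
# into the current directory.
download_from_yaml(yaml_file="example/download.yaml", output_dir=".")
```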
59 changes: 40 additions & 19 deletions kghub_downloader/download_utils.py
@@ -1,25 +1,26 @@
-import os, pathlib, re
-
-import json, yaml
-import compress_json  # type: ignore
-
-# from compress_json import compress_json
-
+import json
+import logging
+import os
+import pathlib
+import re
 from multiprocessing.sharedctypes import Value
-
+from typing import List, Optional
 from urllib.error import URLError
 from urllib.request import Request, urlopen
 
-
+import boto3
+import compress_json  # type: ignore
 import elasticsearch
 import elasticsearch.helpers
-
-from tqdm.auto import tqdm  # type: ignore
+import gdown
+import yaml
+from botocore.exceptions import NoCredentialsError
 from google.cloud import storage
 from google.cloud.storage.blob import Blob
-from typing import List, Optional
-import gdown
+from tqdm.auto import tqdm  # type: ignore
 
+# from compress_json import compress_json
+
 
 GDOWN_MAP = {"gdrive": "https://drive.google.com/uc?id="}
 
@@ -104,6 +105,11 @@ def download_from_yaml(
             Blob.from_string(url, client=storage.Client()).download_to_filename(
                 outfile
             )
+        elif url.startswith("s3://"):
+            s3 = boto3.client("s3")
+            bucket_name = url.split("/")[2]
+            remote_file = "/".join(url.split("/")[3:])
+            s3.download_file(bucket_name, remote_file, outfile)
         elif any(
             url.startswith(str(i))
             for i in list(GDOWN_MAP.keys()) + list(GDOWN_MAP.values())
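
The new `s3://` branch above treats the first path segment as the bucket and everything after it as the object key. Traced by hand with the test URL from example/download.yaml:

```python
url = "s3://monarch-kg-test/kghub_downloader_test_file.yaml"

parts = url.split("/")
# parts == ['s3:', '', 'monarch-kg-test', 'kghub_downloader_test_file.yaml']
bucket_name = parts[2]               # 'monarch-kg-test'
remote_file = "/".join(parts[3:])    # 'kghub_downloader_test_file.yaml'
```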
Expand Down Expand Up @@ -157,6 +163,8 @@ def download_from_yaml(


def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
bucket_split = bucket_url.split("/")
bucket_name = bucket_split[2]
with open(local_file, "rb"):
if bucket_url.startswith("gs://"):

@@ -165,8 +173,6 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
 
             # Connect to GCS Bucket
             storage_client = storage.Client()
-            bucket_split = bucket_url.split("/")
-            bucket_name = bucket_split[2]
             bucket = storage_client.bucket(bucket_name)
 
             # Upload blob from local file
@@ -188,12 +194,27 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
             blob.upload_from_filename(local_file)
 
         elif bucket_url.startswith("s3://"):
-            raise ValueError("Currently, only Google Cloud storage is supported.")
-            # bashCommand = f"aws s3 cp {outfile} {mirror}"
-            # subprocess.run(bashCommand.split())
+            # Create an S3 client
+            s3 = boto3.client("s3")
+
+            try:
+                # Upload the file
+                # ! This will only work if the user has the AWS IAM user
+                # ! access keys set up as environment variables.
+                s3.upload_file(local_file, bucket_name, remote_file)
+                print(f"File {local_file} uploaded to {bucket_name}/{remote_file}")
+                return True
+            except FileNotFoundError:
+                print(f"The file {local_file} was not found")
+                return False
+            except NoCredentialsError:
+                print("Credentials not available")
+                return False
 
         else:
-            raise ValueError("Currently, only Google Cloud storage is supported.")
+            raise ValueError(
+                "Currently, only Google Cloud and S3 storage is supported."
+            )
 
     return None
 
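
A sketch of calling the new upload path directly, assuming the AWS credentials described in docs/index.md are set; the bucket name is a placeholder:

```python
from kghub_downloader.download_utils import mirror_to_bucket

mirror_to_bucket(
    local_file="test_file.yaml",
    bucket_url="s3://my-example-bucket",
    remote_file="test_file.yaml",
)
```

Note that the S3 branch returns True or False while the function is annotated `-> None` (the GCS branch still falls through to `return None`), so callers should not rely on the return value.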
3 changes: 2 additions & 1 deletion kghub_downloader/main.py
@@ -1,6 +1,7 @@
-from typing import Optional, List
+from typing import List, Optional
+
 import typer
 
 from kghub_downloader.download_utils import download_from_yaml
 
 typer_app = typer.Typer()