diff --git a/app/bfworker.py b/app/bfworker.py
new file mode 100644
index 0000000..aa590f9
--- /dev/null
+++ b/app/bfworker.py
@@ -0,0 +1,71 @@
+from pennsieve import Pennsieve
+import pennsieve
+
+from app.config import Config
+
+
+class BFWorker(object):
+    def __init__(self, id):
+        # 'id' is currently unused; the client is configured from app config.
+        self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)
+
+    def getCollectionAndMetaFromPackageId(self, packageId):
+        pkg = self.bf.get(packageId)
+        if isinstance(pkg, pennsieve.DataPackage):
+            colId = pkg.parent
+            col = self.bf.get(colId)
+            items = col.items
+            for item in items:
+                if packageId == item.id:
+                    return [colId, item.name]
+        return None
+
+    def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
+        col = self.bf.get(collectionId)
+        if isinstance(col, pennsieve.Collection):
+            items = col.items
+            for item in items:
+                if fileName == item.name:
+                    pkg = item
+                    try:
+                        bfFile = pkg.files[0]
+                        return bfFile.url
+                    except Exception:
+                        return None
+        return None
+
+    def getUrlfromPackageId(self, packageId):
+        pId = packageId
+        if 'N:' not in packageId:
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url
+
+    def getImagefromPackageId(self, packageId):
+        pId = packageId
+        if 'N:' not in packageId:
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url if pk is not None else ''
+
+    def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
+        # Walk the dataset's folder tree, matching each path segment by name.
+        fileArray = filePath.split('/')
+        fileArray = list(filter(lambda f: not f.isdigit(), fileArray))
+        items = self.bf.get_dataset(datasetId).items
+        count = 0
+        depth = 0
+        while type(items) is list:
+            if count >= len(items):  # no item at this depth matched the path segment
+                return None
+            item = items[count]
+            if fileArray[depth] == item.name:
+                if isinstance(item, pennsieve.Collection):
+                    items = item.items
+                    count = -1
+                    depth += 1
+                    continue
+                else:
+                    try:
+                        return item.files[0].url
+                    except Exception:
+                        return None
+            count += 1
+        return None
diff --git a/app/config.py b/app/config.py
index 3974e1f..0382093 100644
--- a/app/config.py
+++ b/app/config.py
@@ -4,7 +4,7 @@ class Config(object):
-    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST")
+    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
     PENNSIEVE_API_SECRET = os.environ.get("PENNSIEVE_API_SECRET", "local-secret-key")
     PENNSIEVE_API_TOKEN = os.environ.get("PENNSIEVE_API_TOKEN", "local-api-key")
     PENNSIEVE_EMBARGO_TEAM_ID = os.environ.get("PENNSIEVE_EMBARGO_TEAM_ID")
@@ -31,7 +31,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_pr")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
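A minimal usage sketch of the new worker, for orientation only; the dataset id below is hypothetical and valid Pennsieve credentials are assumed in the environment:

from app.bfworker import BFWorker

worker = BFWorker(None)  # the constructor's id argument is currently unused
# Resolve a file inside a dataset to its download URL (returns None if not found).
url = worker.getURLFromDatasetIdAndFilePath(
    'N:dataset:00000000-0000-0000-0000-000000000000',  # hypothetical dataset id
    'derivative/scaffold_context_info.json')
print(url)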
diff --git a/app/main.py b/app/main.py
index 758f9b5..ddb387f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -4,6 +4,7 @@
 import json
 import logging
 import requests
+from flask import make_response
 from apscheduler.schedulers.background import BackgroundScheduler
 from botocore.exceptions import ClientError
@@ -18,7 +19,7 @@
 from app.scicrunch_requests import create_doi_query, create_filter_request, create_facet_query, create_doi_aggregate, create_title_query, \
     create_identifier_query, create_pennsieve_identifier_query, create_field_query, create_request_body_for_curies, create_onto_term_query, \
-    create_multiple_doi_query, create_multiple_discoverId_query
+    create_multiple_doi_query, create_multiple_discoverId_query, create_pennsieve_id_query
 from scripts.email_sender import EmailSender, feedback_email, resource_submission_confirmation_email, creation_request_confirmation_email, issue_reporting_email, community_spotlight_submit_form_email, news_and_events_submit_form_email
 from threading import Lock
 from xml.etree import ElementTree
@@ -31,6 +32,8 @@
 from app.utilities import img_to_base64_str
 from app.osparc import run_simulation
 from app.biolucida_process_results import process_results as process_biolucida_results
+from app.bfworker import BFWorker
+from scripts.pennsieve import pennsieve_login, get_banner

 app = Flask(__name__)
 # set environment variable
@@ -40,6 +43,7 @@
 ma = Marshmallow(app)

 email_sender = EmailSender()
+bfWorker = BFWorker(None)
 ps = None

 s3 = boto3.client(
@@ -249,6 +253,12 @@ def create_presigned_url(expiration=3600):
     return create_s3_presigned_url(key, content_type, expiration)


+@app.route("/get_banner/<datasetId>")
+def get_banner_pen(datasetId):
+    p_temp_key = pennsieve_login()
+    ban = get_banner(p_temp_key, datasetId)
+    return ban
+

 @app.route("/thumbnail/neurolucida")
 def thumbnail_from_neurolucida_file():
@@ -368,33 +378,91 @@ def get_discover_path():

 # Reverse proxy for objects from S3, a simple get object
 # operation. This is used by scaffoldvuer and its
-# important to keep the relative for accessing
-# other required files.
-@app.route("/s3-resource/<path:path>")
+# # important to keep the relative for accessing
+# # other required files.
+# @app.route("/s3-resource/<path:path>")
+# def direct_download_url(path):
+#     print(path)
+#     head_response = s3.head_object(
+#         Bucket=Config.S3_BUCKET_NAME,
+#         Key=path,
+#         RequestPayer="requester"
+#     )
+#
+#     content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
+#     if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT:  # 20 MB
+#         return abort(413, description=f"File too big to download: {content_length}")
+#
+#     response = s3.get_object(
+#         Bucket=Config.S3_BUCKET_NAME,
+#         Key=path,
+#         RequestPayer="requester"
+#     )
+#
+#     encode_base64 = request.args.get("encodeBase64")
+#     resource = response["Body"].read()
+#     if encode_base64 is not None:
+#         return base64.b64encode(resource)
+#
+#     return resource

+# This version of s3-resource is used for accessing files on staging. Use it as a replacement for 's3-resource'.
+# No changes are needed on the front end; just use s3-resource as normal.
+# @app.route("/s3-resource/<path:path>")
 def direct_download_url(path):
-    head_response = s3.head_object(
-        Bucket=Config.S3_BUCKET_NAME,
-        Key=path,
-        RequestPayer="requester"
-    )
+    print(path)
+    filePath = path.split('files/')[-1]
+    discoverId = path.split('/')[0]
+    dataset_query = {
+        "size": 20,
+        "from": 0,
+        "query": {
+            "query_string": {
+                "fields": [
+                    "*pennsieve.identifier"
+                ],
+                "query": discoverId
+            }
+        },
+        "_source": [
+            "item.identifier"
+        ]
+    }
+    resp = dataset_search(dataset_query)
+    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url is not None:
+        resp2 = requests.get(url)
+        return resp2.content
+    return jsonify({'error': 'error with the provided ID'}), 502
+
+
+# This version of s3-resource is used for accessing files on staging that have never been published.
+@app.route("/s3-resource/<path:path>")
+def direct_download_url2(path):
+    print(path)
+    filePath = path.split('files/')[-1]
+    pennsieveId = path.split('/')[0]

-    content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
-    if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT:  # 20 MB
-        return abort(413, description=f"File too big to download: {content_length}")
+    # A short first path segment is a Pennsieve discover id; hand it to the
+    # discover-id handler above.
+    if len(pennsieveId) <= 4:
+        return direct_download_url(path)

-    response = s3.get_object(
-        Bucket=Config.S3_BUCKET_NAME,
-        Key=path,
-        RequestPayer="requester"
-    )
+    if 'N:package:' not in pennsieveId:
+        pennsieveId = 'N:dataset:' + pennsieveId

-    encode_base64 = request.args.get("encodeBase64")
-    resource = response["Body"].read()
-    if encode_base64 is not None:
-        return base64.b64encode(resource)
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url is not None:
+        resp2 = requests.get(url)
+        return resp2.content
+    return jsonify({'error': 'error with the provided ID'}), 502

-    return resource

+@app.route("/proxy/")
+def proxy():
+    url = request.args.get('url')
+    if url is None:
+        return jsonify({'error': 'no url provided'}), 400
+    resp = requests.get(url)
+    return resp.content

 @app.route("/scicrunch-dataset/<doi1>/<doi2>")
 def sci_doi(doi1, doi2):
@@ -470,6 +538,44 @@ def get_dataset_info_discoverIds():
     return process_results(dataset_search(query))


+@app.route('/urlFromPennsieveDatasetIdAndFilePath/<discoverId>')
+def getFileUrlFromPennsieve(discoverId):
+    filePath = request.args.get('filePath')
+    dataset_query = {
+        "size": 20,
+        "from": 0,
+        "query": {
+            "query_string": {
+                "fields": [
+                    "*pennsieve.identifier"
+                ],
+                "query": discoverId
+            }
+        },
+        "_source": [
+            "item.identifier"
+        ]
+    }
+    resp = dataset_search(dataset_query)
+    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url is not None:
+        return jsonify({'url': url})
+    return jsonify({'error': 'error with the provided ID'}), 502
+
+
+@app.route("/dataset_info/using_pennsieveId")
+@app.route("/dataset_info/using_pennsieveId/")
+def get_dataset_info_pennsieve_id():
+    ids = request.args.get('ids')
+
+    # Short ids are Pennsieve discover ids; longer ones are dataset ids.
+    if len(ids) > 4:
+        query = create_pennsieve_id_query(ids)
+    else:
+        query = create_multiple_discoverId_query([ids])
+
+    return process_results(dataset_search(query))
+

 @app.route("/dataset_info/using_title")
 def get_dataset_info_title():
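A quick smoke check for the new file-path route; the host/port are assumptions for a local dev server, while the dataset id and file path are the ones used by the test suite below:

import requests

base = "http://localhost:5000"  # assumed local dev server
# Resolve a file in discover dataset 76 to its Pennsieve download URL.
r = requests.get(f"{base}/urlFromPennsieveDatasetIdAndFilePath/76?filePath=derivative%2Fscaffold_context_info.json")
print(r.json().get("url"))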
diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py
index 2f09336..fad0d9c 100644
--- a/app/scicrunch_process_results.py
+++ b/app/scicrunch_process_results.py
@@ -17,6 +17,10 @@ def _prepare_results(results):
         #Try to get minimal information out from the datasets
         version = 'undefined'
+        if version >= '1.1.5':
+            print('WARNING! SciCrunch processing is out of date!')
+            version = '1.1.5'
+
         package_version = f'scicrunch_processing_v_{version.replace(".", "_")}'
         m = importlib.import_module(f'app.{package_version}')
         attributes_map = getattr(m, 'ATTRIBUTES_MAP')
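One caveat with the added check: lexicographic string comparison misorders versions such as '1.1.10' vs '1.1.5' (it only works here because 'undefined' happens to sort above '1'). A tuple-based sketch that preserves the fallback behavior (parse_version is a hypothetical helper, not part of this PR):

def parse_version(v):
    try:
        return tuple(int(p) for p in v.split('.'))
    except ValueError:
        return None  # 'undefined' or malformed

parsed = parse_version(version)
if parsed is None or parsed > (1, 1, 5):
    print('WARNING! SciCrunch processing is out of date!')
    version = '1.1.5'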
diff --git a/app/scicrunch_processing_v_1_1_5.py b/app/scicrunch_processing_v_1_1_5.py
index 342e0e5..5dda5de 100644
--- a/app/scicrunch_processing_v_1_1_5.py
+++ b/app/scicrunch_processing_v_1_1_5.py
@@ -5,7 +5,7 @@
 from app.scicrunch_processing_common import PASS_THROUGH_KEYS as BASE_PASS_THROUGH_KEYS
 from app.manifest_name_to_discover_name import name_map

-PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", *BASE_PASS_THROUGH_KEYS]
+PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", "keywords", *BASE_PASS_THROUGH_KEYS]

 # attributes is used to map desired parameters onto the path of keys needed in the sci-crunch response.
@@ -17,6 +17,7 @@
     'sampleSize': ['item', 'statistics', 'samples', 'count'],
     'subjectSize': ['item', 'statistics', 'subjects', 'count'],
     'name': ['item', 'name'],
+    'keywords': ['item', 'keywords'],
     'description': ['item', 'description'],
     'identifier': ['item', 'identifier'],
     'uri': ['distributions', 'current', 'uri'],
diff --git a/app/scicrunch_requests.py b/app/scicrunch_requests.py
index de96891..fdb790a 100644
--- a/app/scicrunch_requests.py
+++ b/app/scicrunch_requests.py
@@ -1,3 +1,4 @@
+import json
 def create_query_string(query_string):
     return {
         "from": 0,
@@ -160,6 +161,22 @@ def create_doi_request(doi):
     return query

+def create_pennsieve_id_query(pennsieveId):
+    query = {
+        "size": 50,
+        "from": 0,
+        "query": {
+            "term": {
+                "item.identifier.aggregate": {
+                    "value": f"N:dataset:{pennsieveId}"
+                }
+            }
+        }
+    }
+
+    return query
+
 # create_facet_query(type): Generates facet search request data for sci-crunch given a 'type'; where
 # 'type' is either 'species', 'gender', or 'organ' at this stage.
 # Returns a tuple of the type-map and request data ( type_map, data )
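For reference, the exact-match term query the new helper emits; the dataset id here is illustrative only:

from app.scicrunch_requests import create_pennsieve_id_query

print(create_pennsieve_id_query("abc123"))
# {'size': 50, 'from': 0, 'query': {'term': {'item.identifier.aggregate': {'value': 'N:dataset:abc123'}}}}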
diff --git a/scripts/pennsieve.py b/scripts/pennsieve.py
new file mode 100644
index 0000000..6b1b5fb
--- /dev/null
+++ b/scripts/pennsieve.py
@@ -0,0 +1,41 @@
+import boto3
+import requests
+
+from app.config import Config
+
+
+# Returns a Pennsieve API token valid for 24 hours.
+def pennsieve_login():
+    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/authentication/cognito-config")
+    r.raise_for_status()
+
+    cognito_app_client_id = r.json()["tokenPool"]["appClientId"]
+    cognito_region = r.json()["region"]
+
+    cognito_idp_client = boto3.client(
+        "cognito-idp",
+        region_name=cognito_region,
+        aws_access_key_id=Config.SPARC_PORTAL_AWS_KEY,
+        aws_secret_access_key=Config.SPARC_PORTAL_AWS_SECRET,
+    )
+
+    login_response = cognito_idp_client.initiate_auth(
+        AuthFlow="USER_PASSWORD_AUTH",
+        AuthParameters={"USERNAME": Config.PENNSIEVE_API_TOKEN, "PASSWORD": Config.PENNSIEVE_API_SECRET},
+        ClientId=cognito_app_client_id,
+    )
+
+    api_key = login_response["AuthenticationResult"]["AccessToken"]
+    return api_key
+
+
+def get_banner(pennsieve_temp_api_key, dataset_id):
+    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/datasets/N%3Adataset%3A{dataset_id}/banner",
+                     headers={"Authorization": f"Bearer {pennsieve_temp_api_key}"})
+    r.raise_for_status()
+    return r.json()
diff --git a/tests/test_api.py b/tests/test_api.py
index 017137d..095ae32 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -22,6 +22,14 @@ def test_direct_download_url_small_file(client):
     assert r.status_code == 200
     assert b"proximal colon" in r.data

+def test_pennsieve_file_path_download(client):
+    colon_dataset_id = 76
+    colon_file_path = 'derivative%2Fscaffold_context_info.json'
+    r = client.get(f"/urlFromPennsieveDatasetIdAndFilePath/{colon_dataset_id}?filePath={colon_file_path}")
+    assert r.status_code == 200
+    assert 'url' in r.json
+
+
 def test_direct_download_url_thumbnail(client):
     small_s3_file = '95/1/files/derivative%2FScaffold%2Fthumbnail.png'
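A possible companion test for the new banner route, following the same fixture pattern as the tests above; it assumes dataset 76 is visible to the configured Pennsieve credentials and that the upstream banner response contains a 'banner' key:

def test_get_banner(client):
    r = client.get("/get_banner/76")
    assert r.status_code == 200
    assert 'banner' in r.json  # response key assumed from the Pennsieve banner endpoint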