Staging sparc api nbpd #157

Draft · wants to merge 14 commits into master
71 changes: 71 additions & 0 deletions app/bfworker.py
@@ -0,0 +1,71 @@
from pennsieve import Pennsieve
import pennsieve
from app.config import Config


class BFWorker(object):
    def __init__(self, id=None):
        # `id` is currently unused; the client authenticates with the configured keys.
        self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)

    def getCollectionAndMetaFromPackageId(self, packageId):
        # Resolve a package to its parent collection id and display name.
        pkg = self.bf.get(packageId)
        if type(pkg) is pennsieve.DataPackage:
            colId = pkg.parent
            col = self.bf.get(colId)
            for item in col.items:
                if packageId == item.id:
                    return [colId, item.name]
        return None

    def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
        col = self.bf.get(collectionId)
        if type(col) is pennsieve.Collection:
            for item in col.items:
                if fileName == item.name:
                    try:
                        return item.files[0].url
                    except Exception:
                        return None
        return None

    def getUrlfromPackageId(self, packageId):
        pId = packageId
        if 'N:' not in packageId:
            pId = 'N:' + packageId
        pk = self.bf.get(pId)
        return pk.files[0].url

    def getImagefromPackageId(self, packageId):
        pId = packageId
        if 'N:' not in packageId:
            pId = 'N:' + packageId
        pk = self.bf.get(pId)
        return pk.files[0].url if pk is not None else ''

    def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
        # Walk the dataset tree, matching each path segment by item name.
        # Numeric segments (e.g. a discover version number) are dropped first.
        fileArray = filePath.split('/')
        fileArray = list(filter(lambda f: not f.isdigit(), fileArray))
        items = self.bf.get_dataset(datasetId).items
        count = 0
        depth = 0
        while type(items) is list and count < len(items):
            item = items[count]
            if fileArray[depth] == item.name:
                if type(item) is pennsieve.Collection:
                    # Descend into the matching collection and rescan from the start.
                    items = item.items
                    count = -1
                    depth += 1
                else:
                    try:
                        return item.files[0].url
                    except Exception:
                        return None
            count += 1
        return None
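
A minimal usage sketch for this worker (assumes valid Pennsieve credentials in Config; the dataset id and file path are placeholders):

    from app.bfworker import BFWorker

    worker = BFWorker(None)
    # Resolve a file inside a dataset to a presigned URL for its first source file.
    url = worker.getURLFromDatasetIdAndFilePath('N:dataset:your-dataset-uuid', 'derivative/scaffold.json')
    if url is not None:
        print(url)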
4 changes: 2 additions & 2 deletions app/config.py
@@ -4,7 +4,7 @@


class Config(object):
    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST")
    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
    PENNSIEVE_API_SECRET = os.environ.get("PENNSIEVE_API_SECRET", "local-secret-key")
    PENNSIEVE_API_TOKEN = os.environ.get("PENNSIEVE_API_TOKEN", "local-api-key")
    PENNSIEVE_EMBARGO_TEAM_ID = os.environ.get("PENNSIEVE_EMBARGO_TEAM_ID")
@@ -31,7 +31,7 @@ class Config(object):
    KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
    DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
    SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_pr")
    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage")
    MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
    SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
    WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
150 changes: 128 additions & 22 deletions app/main.py
@@ -4,6 +4,7 @@
import json
import logging
import requests
from flask import make_response

from apscheduler.schedulers.background import BackgroundScheduler
from botocore.exceptions import ClientError
@@ -18,7 +19,7 @@

from app.scicrunch_requests import create_doi_query, create_filter_request, create_facet_query, create_doi_aggregate, create_title_query, \
    create_identifier_query, create_pennsieve_identifier_query, create_field_query, create_request_body_for_curies, create_onto_term_query, \
    create_multiple_doi_query, create_multiple_discoverId_query
    create_multiple_doi_query, create_multiple_discoverId_query, create_pennsieve_id_query
from scripts.email_sender import EmailSender, feedback_email, resource_submission_confirmation_email, creation_request_confirmation_email, issue_reporting_email, community_spotlight_submit_form_email, news_and_events_submit_form_email
from threading import Lock
from xml.etree import ElementTree
@@ -31,6 +32,8 @@
from app.utilities import img_to_base64_str
from app.osparc import run_simulation
from app.biolucida_process_results import process_results as process_biolucida_results
from app.bfworker import BFWorker
from scripts.pennsieve import pennsieve_login, get_banner

app = Flask(__name__)
# set environment variable
@@ -40,6 +43,7 @@

ma = Marshmallow(app)
email_sender = EmailSender()
bfWorker = BFWorker(None)

ps = None
s3 = boto3.client(
@@ -249,6 +253,12 @@ def create_presigned_url(expiration=3600):

    return create_s3_presigned_url(key, content_type, expiration)


@app.route("/get_banner/<datasetId>")
def get_banner_pen(datasetId):
    p_temp_key = pennsieve_login()
    ban = get_banner(p_temp_key, datasetId)
    return ban


@app.route("/thumbnail/neurolucida")
def thumbnail_from_neurolucida_file():
@@ -368,33 +378,91 @@ def get_discover_path():

# Reverse proxy for objects from S3, a simple get object
# operation. This is used by scaffoldvuer and its
# # important to keep the relative <path> for accessing
# # other required files.
# @app.route("/s3-resource/<path:path>")
# def direct_download_url(path):
#     print(path)
#     head_response = s3.head_object(
#         Bucket=Config.S3_BUCKET_NAME,
#         Key=path,
#         RequestPayer="requester"
#     )
#
#     content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
#     if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT:  # 20 MB
#         return abort(413, description=f"File too big to download: {content_length}")
#
#     response = s3.get_object(
#         Bucket=Config.S3_BUCKET_NAME,
#         Key=path,
#         RequestPayer="requester"
#     )
#
#     encode_base64 = request.args.get("encodeBase64")
#     resource = response["Body"].read()
#     if encode_base64 is not None:
#         return base64.b64encode(resource)
#
#     return resource

# This version of s3-resource is used for accessing files on staging. Use it as a replacement for 's3-resource'.
# No changes are needed on the front end; just use s3-resource as normal.
# @app.route("/s3-resource/<path:path>")
def direct_download_url(path):
    print(path)
    filePath = path.split('files/')[-1]
    discoverId = path.split('/')[0]
    dataset_query = {
        "size": 20,
        "from": 0,
        "query": {
            "query_string": {
                "fields": [
                    "*pennsieve.identifier"
                ],
                "query": discoverId
            }
        },
        "_source": [
            "item.identifier"
        ]
    }
    resp = dataset_search(dataset_query)
    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
    if url is not None:
        resp2 = requests.get(url)
        return resp2.content
    return jsonify({'error': 'error with the provided ID'}), 502

# This version of s3-resource is used for accessing files on staging that have never been published.
@app.route("/s3-resource/<path:path>")
def direct_download_url2(path):
    print(path)
    filePath = path.split('files/')[-1]
    pennsieveId = path.split('/')[0]

    # If the id is short, it is a Pennsieve discover id; process it with the staging s3-resource route above.
    if len(pennsieveId) <= 4:
        return direct_download_url(path)

    if 'N:package:' not in pennsieveId:
        pennsieveId = 'N:dataset:' + pennsieveId

    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
    if url is not None:
        resp2 = requests.get(url)
        return resp2.content
    return jsonify({'error': 'error with the provided ID'}), 502

@app.route("/proxy/")
def proxy():
url = request.args.get('url')
resp = requests.get(url)
return resp.content
return jsonify({'error': 'error with the provided ID '}, status=502)

@app.route("/scicrunch-dataset/<doi1>/<doi2>")
def sci_doi(doi1, doi2):
Expand Down Expand Up @@ -470,6 +538,44 @@ def get_dataset_info_discoverIds():

return process_results(dataset_search(query))

@app.route('/urlFromPennsieveDatasetIdAndFilePath/<discoverId>')
def getFileUrlFromPennsieve(discoverId):
    filePath = request.args.get('filePath')
    dataset_query = {
        "size": 20,
        "from": 0,
        "query": {
            "query_string": {
                "fields": [
                    "*pennsieve.identifier"
                ],
                "query": discoverId
            }
        },
        "_source": [
            "item.identifier"
        ]
    }
    resp = dataset_search(dataset_query)
    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
    if url is not None:
        return jsonify({'url': url})
    return jsonify({'error': 'error with the provided ID'}), 502


@app.route("/dataset_info/using_pennsieveId")
@app.route("/dataset_info/using_pennsieveId/")
def get_dataset_info_pennsieve_id():
ids = request.args.get('ids')

if len(ids) > 4:
query = create_pennsieve_id_query(ids)
else:
query = create_multiple_discoverId_query([ids])

return process_results(dataset_search(query))


@app.route("/dataset_info/using_title")
def get_dataset_info_title():
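
For illustration, a sketch of how a client might call the new routes (the host, ids, and file path are placeholders):

    import requests

    API = 'http://localhost:5000'

    # Banner for a dataset (Pennsieve dataset uuid, without the 'N:dataset:' prefix)
    banner = requests.get(f"{API}/get_banner/your-dataset-uuid").json()

    # Resolve a discover id plus file path to a presigned URL
    r = requests.get(f"{API}/urlFromPennsieveDatasetIdAndFilePath/267",
                     params={'filePath': 'derivative/scaffold.json'})

    # Staging-aware s3-resource: accepts a discover id or an unpublished dataset id
    data = requests.get(f"{API}/s3-resource/267/files/derivative/scaffold.json").content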
4 changes: 4 additions & 0 deletions app/scicrunch_process_results.py
@@ -17,6 +17,10 @@ def _prepare_results(results):
    #Try to get minimal information out from the datasets
    version = 'undefined'

    if version >= '1.1.5':
        print('WARNING! Scicrunch processing is out of date!')
        version = '1.1.5'

    package_version = f'scicrunch_processing_v_{version.replace(".", "_")}'
    m = importlib.import_module(f'app.{package_version}')
    attributes_map = getattr(m, 'ATTRIBUTES_MAP')
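
Note that the version cap relies on lexicographic string comparison, which happens to work for the values seen here; a quick illustration:

    >>> 'undefined' >= '1.1.5'   # 'u' sorts after '1'
    True
    >>> '1.2.0' >= '1.1.5'
    True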
3 changes: 2 additions & 1 deletion app/scicrunch_processing_v_1_1_5.py
@@ -5,7 +5,7 @@
from app.scicrunch_processing_common import PASS_THROUGH_KEYS as BASE_PASS_THROUGH_KEYS
from app.manifest_name_to_discover_name import name_map

PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", *BASE_PASS_THROUGH_KEYS]
PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", "keywords", *BASE_PASS_THROUGH_KEYS]


# attributes is used to map desired parameters onto the path of keys needed in the sci-crunch response.
@@ -17,6 +17,7 @@
    'sampleSize': ['item', 'statistics', 'samples', 'count'],
    'subjectSize': ['item', 'statistics', 'subjects', 'count'],
    'name': ['item', 'name'],
    'keywords': ['item', 'keywords'],
    'description': ['item', 'description'],
    'identifier': ['item', 'identifier'],
    'uri': ['distributions', 'current', 'uri'],
17 changes: 17 additions & 0 deletions app/scicrunch_requests.py
@@ -1,3 +1,4 @@
import json
def create_query_string(query_string):
    return {
        "from": 0,
@@ -160,6 +161,22 @@ def create_doi_request(doi):
    return query


def create_pennsieve_id_query(pennsieveId):
    query = {
        "size": 50,
        "from": 0,
        "query": {
            "term": {
                "item.identifier.aggregate": {
                    "value": f"N:dataset:{pennsieveId}"
                }
            }
        }
    }

    print(query)
    return query

# create_facet_query(type): Generates facet search request data for sci-crunch given a 'type'; where
# 'type' is either 'species', 'gender', or 'organ' at this stage.
# Returns a tuple of the type-map and request data ( type_map, data )
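
For reference, a sketch of the request body this helper produces (the dataset uuid is a placeholder):

    from app.scicrunch_requests import create_pennsieve_id_query

    query = create_pennsieve_id_query('your-dataset-uuid')
    # {
    #     "size": 50,
    #     "from": 0,
    #     "query": {
    #         "term": {
    #             "item.identifier.aggregate": {"value": "N:dataset:your-dataset-uuid"}
    #         }
    #     }
    # }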
41 changes: 41 additions & 0 deletions scripts/pennsieve.py
@@ -0,0 +1,41 @@
import boto3
import requests

from app.config import Config


# Returns a Pennsieve API token valid for 24 hours.
def pennsieve_login():
    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/authentication/cognito-config")
    r.raise_for_status()

    cognito_config = r.json()
    cognito_app_client_id = cognito_config["tokenPool"]["appClientId"]
    cognito_region = cognito_config["region"]

    cognito_idp_client = boto3.client(
        "cognito-idp",
        region_name=cognito_region,
        aws_access_key_id=Config.SPARC_PORTAL_AWS_KEY,
        aws_secret_access_key=Config.SPARC_PORTAL_AWS_SECRET,
    )

    login_response = cognito_idp_client.initiate_auth(
        AuthFlow="USER_PASSWORD_AUTH",
        AuthParameters={"USERNAME": Config.PENNSIEVE_API_TOKEN, "PASSWORD": Config.PENNSIEVE_API_SECRET},
        ClientId=cognito_app_client_id,
    )

    api_key = login_response["AuthenticationResult"]["AccessToken"]
    return api_key


def get_banner(pennsieve_temp_api_key, dataset_id):
    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/datasets/N%3Adataset%3A{dataset_id}/banner",
                     headers={"Authorization": f"Bearer {pennsieve_temp_api_key}"})
    r.raise_for_status()
    return r.json()
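
A quick usage sketch of these helpers (assumes valid Pennsieve and AWS credentials in Config; the dataset uuid is a placeholder):

    from scripts.pennsieve import pennsieve_login, get_banner

    temp_key = pennsieve_login()                        # Cognito access token, valid for ~24 hours
    banner = get_banner(temp_key, 'your-dataset-uuid')  # JSON payload from the banner endpoint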