From bd1a8f4f19bddd7c953d41c9d69516c45aaca9ea Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Tue, 22 Mar 2022 16:11:36 +1300
Subject: [PATCH 01/13] Create a version of sparc-api that can be used with
 the 'stage' index

---
 app/bfworker.py                  |  69 ++++++++++++++++++++
 app/config.py                    |   2 +-
 app/main.py                      | 105 ++++++++++++++++++++++++-------
 app/scicrunch_process_results.py |   4 ++
 app/scicrunch_requests.py        |   9 +++
 tests/test_api.py                |   8 +++
 6 files changed, 172 insertions(+), 25 deletions(-)
 create mode 100644 app/bfworker.py

diff --git a/app/bfworker.py b/app/bfworker.py
new file mode 100644
index 0000000..8c3119d
--- /dev/null
+++ b/app/bfworker.py
@@ -0,0 +1,69 @@
+from pennsieve import Pennsieve
+import pennsieve
+from app.config import Config
+
+
+class BFWorker(object):
+    def __init__(self, id):
+        self.bf = Pennsieve(api_token=Config.PENNSIEVE_API_TOKEN, api_secret=Config.PENNSIEVE_API_SECRET)
+
+    def getCollectionAndMetaFromPackageId(self, packageId):
+        pkg = self.bf.get(packageId)
+        if type(pkg) is pennsieve.DataPackage:
+            colId = pkg.parent
+            col = self.bf.get(colId)
+            items = col.items
+            for item in items:
+                if packageId == item.id:
+                    return [colId, item.name]
+        return None
+
+    def getURLFromCollectionIdAndFileName(self, collectionId, fileName):
+        col = self.bf.get(collectionId)
+        if type(col) is pennsieve.Collection:
+            items = col.items
+            for item in items:
+                if fileName == item.name:
+                    pkg = item
+                    try:
+                        bfFile = pkg.files[0]
+                        url = bfFile.url
+                        return url
+                    except:
+                        return None
+        return None
+
+    def getUrlfromPackageId(self, packageId):
+        pId = packageId
+        if ('N:' not in packageId):
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        return pk.files[0].url
+
+    def getImagefromPackageId(self, packageId):
+        pId = packageId
+        if ('N:' not in packageId):
+            pId = 'N:' + packageId
+        pk = self.bf.get(pId)
+        # resp = requests.get(pk.files[0].url)
+        return pk.files[0].url if pk is not None else ''
+
+    def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
+        fileArray = filePath.split('/')
+        items = self.bf.get_dataset(datasetId).items
+        count = 0
+        while type(items) is list:
+            item = items[count]
+            for fileName in fileArray:
+                if fileName == item.name:
+                    if type(item) is pennsieve.Collection:
+                        items = item.items
+                        count = 0
+                        continue
+                    else:
+                        try:
+                            return item.files[0].url
+                        except:
+                            return None
+            count += 1
+        return None
diff --git a/app/config.py b/app/config.py
index e5b9f6f..df21497 100644
--- a/app/config.py
+++ b/app/config.py
@@ -30,7 +30,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_pr")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
diff --git a/app/main.py b/app/main.py
index b514258..eb7d500 100644
--- a/app/main.py
+++ b/app/main.py
@@ -30,6 +30,7 @@
 from app.utilities import img_to_base64_str
 from app.osparc import run_simulation
 from app.biolucida_process_results import process_results as process_biolucida_results
+from app.bfworker import BFWorker
 
 app = Flask(__name__)
 # set environment variable
@@ -39,6 +40,7 @@
 ma = Marshmallow(app)
 email_sender = EmailSender()
+bfWorker = BFWorker(None)
 ps = None
 
 s3 = boto3.client(
@@ -317,32 +319,63 @@ def presign_resource_url():
 
 # Reverse proxy for objects from S3, a simple get object
 # operation. This is used by scaffoldvuer and its
-# important to keep the relative for accessing
-# other required files.
+# # important to keep the relative for accessing
+# # other required files.
+# @app.route("/s3-resource/<path:path>")
+# def direct_download_url(path):
+#     print(path)
+#     head_response = s3.head_object(
+#         Bucket=Config.S3_BUCKET_NAME,
+#         Key=path,
+#         RequestPayer="requester"
+#     )
+#
+#     content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
+#     if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT:  # 20 MB
+#         return abort(413, description=f"File too big to download: {content_length}")
+#
+#     response = s3.get_object(
+#         Bucket=Config.S3_BUCKET_NAME,
+#         Key=path,
+#         RequestPayer="requester"
+#     )
+#
+#     encode_base64 = request.args.get("encodeBase64")
+#     resource = response["Body"].read()
+#     if encode_base64 is not None:
+#         return base64.b64encode(resource)
+#
+#     return resource
+
+# This version of s3-resources is used for accessing files on staging. Use it as a replacement for 's3-resource'.
+# No changes are needed on the front end; just use s3-resource as normal.
 @app.route("/s3-resource/<path:path>")
 def direct_download_url(path):
-    head_response = s3.head_object(
-        Bucket=Config.S3_BUCKET_NAME,
-        Key=path,
-        RequestPayer="requester"
-    )
-
-    content_length = head_response.get('ContentLength', Config.DIRECT_DOWNLOAD_LIMIT)
-    if content_length and content_length > Config.DIRECT_DOWNLOAD_LIMIT:  # 20 MB
-        return abort(413, description=f"File too big to download: {content_length}")
-
-    response = s3.get_object(
-        Bucket=Config.S3_BUCKET_NAME,
-        Key=path,
-        RequestPayer="requester"
-    )
-
-    encode_base64 = request.args.get("encodeBase64")
-    resource = response["Body"].read()
-    if encode_base64 is not None:
-        return base64.b64encode(resource)
-
-    return resource
+    print(path)
+    filePath = path.split('files/')[-1]
+    discoverId = path.split('/')[0]
+    dataset_query = {
+        "size": 20,
+        "from": 0,
+        "query": {
+            "query_string": {
+                "fields": [
+                    "*pennsieve.identifier"
+                ],
+                "query": discoverId
+            }
+        },
+        "_source": [
+            "item.identifier"
+        ]
+    }
+    resp = dataset_search(dataset_query)
+    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url != None:
+        resp2 = requests.get(url)
+        return resp2.json()
+    return jsonify({'error': 'error with the provided ID '}, status=502)
 
 
 @app.route("/scicrunch-dataset/<doi1>/<doi2>")
@@ -419,6 +452,30 @@ def get_dataset_info_discoverIds():
 
     return process_results(dataset_search(query))
 
+
+@app.route('/urlFromPennsieveDatasetIdAndFilePath/<discoverId>')
+def getFileUrlFromPennsieve(discoverId):
+    filePath = request.args.get('filePath')
+    dataset_query = {
+        "size": 20,
+        "from": 0,
+        "query": {
+            "query_string": {
+                "fields": [
+                    "*pennsieve.identifier"
+                ],
+                "query": discoverId
+            }
+        },
+        "_source": [
+            "item.identifier"
+        ]
+    }
+    resp = dataset_search(dataset_query)
+    pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url != None:
+        return jsonify({'url': url})
+    return jsonify({'error': 'error with the provided ID '}, status=502)
+
 @app.route("/dataset_info/using_title")
 def get_dataset_info_title():
diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py
index 963e243..070d468 100644
--- a/app/scicrunch_process_results.py
+++ b/app/scicrunch_process_results.py
@@ -27,6 +27,10 @@ def _prepare_results(results):
             for file in hit['_source']['objects']
             if file['additional_mimetype']['name'].find('abi.context-information') is not -1
         ]
+        print([
+            file['additional_mimetype']['name']
+            for file in hit['_source']['objects']
+        ])
         try:
             attr['readme'] = hit['_source']['item']['readme']['description']
         except KeyError:
diff --git a/app/scicrunch_requests.py b/app/scicrunch_requests.py
index 5b0886c..5014406 100644
--- a/app/scicrunch_requests.py
+++ b/app/scicrunch_requests.py
@@ -1,3 +1,4 @@
+import json
 def create_query_string(query_string):
     return {
         "from": 0,
@@ -20,6 +21,14 @@ def create_doi_query(doi):
     }
 
 
 def create_multiple_doi_query(dois, size=10, from_=0):
+    print(json.dumps({
+        "size": 999,
+        "query": {
+            "terms": {
+                "item.curie": dois
+            }
+        }
+    }))
     return {
         "size": 999,
         "query": {
diff --git a/tests/test_api.py b/tests/test_api.py
index fcfc07a..ffd6193 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -22,6 +22,14 @@ def test_direct_download_url_small_file(client):
     assert r.status_code == 200
     assert b"proximal colon" in r.data
 
+def test_pennsieve_file_path_download(client):
+    colon_dataset_id = 76
+    colon_file_path = 'derivative%2Fscaffold_context_info.json'
+    r = client.get(f"/urlFromPennsieveDatasetIdAndFilePath/{colon_dataset_id}?filePath={colon_file_path}")
+    assert r.status_code == 200
+    assert 'url' in r.json
+
+
 def test_direct_download_url_thumbnail(client):
     small_s3_file = '95/1/files/derivative%2FScaffold%2Fthumbnail.png'

From 8089ad99527fb5418db99e630d29cc30afd9e297 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 25 Mar 2022 00:12:01 +1300
Subject: [PATCH 02/13] Fix issues in logic

- the first file was getting skipped
- we now only return file contents if the file is JSON
---
 app/bfworker.py | 2 +-
 app/main.py     | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/app/bfworker.py b/app/bfworker.py
index 8c3119d..de876b6 100644
--- a/app/bfworker.py
+++ b/app/bfworker.py
@@ -58,7 +58,7 @@ def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
                 if fileName == item.name:
                     if type(item) is pennsieve.Collection:
                         items = item.items
-                        count = 0
+                        count = -1
                         continue
                     else:
                         try:
diff --git a/app/main.py b/app/main.py
index eb7d500..e9e714d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -373,8 +373,11 @@ def direct_download_url(path):
     pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
     url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
     if url != None:
-        resp2 = requests.get(url)
-        return resp2.json()
+        if '.json' in path:
+            resp2 = requests.get(url)
+            return resp2.json()
+        else:
+            return url
     return jsonify({'error': 'error with the provided ID '}, status=502)

From 44bc3565dc5b8c5f75b1390d225030b2362f17c0 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 25 Mar 2022 13:44:57 +1300
Subject: [PATCH 03/13] Return content for anything but JSON in s3-resource

---
 app/main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/main.py b/app/main.py
index e9e714d..1149d0d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -377,7 +377,8 @@ def direct_download_url(path):
             resp2 = requests.get(url)
             return resp2.json()
         else:
-            return url
+            resp2 = requests.get(url)
+            return resp2.content
     return jsonify({'error': 'error with the provided ID '}, status=502)
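Taken together, patches 01-03 swap the direct S3 proxy for a two-step lookup: SciCrunch resolves the discover identifier at the front of the requested path to a Pennsieve dataset identifier, and BFWorker then walks that dataset's folder tree to a presigned file URL whose contents are returned. A minimal client-side sketch of the flow, assuming a locally running sparc-api at a placeholder host:

    import requests

    API_HOST = "http://localhost:5000"  # placeholder; wherever this sparc-api build is running

    # discover dataset 76, version 1, file derivative/scaffold_context_info.json
    path = "76/1/files/derivative%2Fscaffold_context_info.json"
    resp = requests.get(f"{API_HOST}/s3-resource/{path}")
    resp.raise_for_status()
    content = resp.content  # raw bytes fetched via the presigned Pennsieve URL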
From 754316ce582633b1d3c8ef670bc1a94386e04244 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 1 Jul 2022 14:46:23 +1200
Subject: [PATCH 04/13] A few small fixes

---
 app/config.py                      | 2 +-
 app/main.py                        | 8 ++------
 app/scicrunch_processing_common.py | 1 +
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/app/config.py b/app/config.py
index df21497..1eddca2 100644
--- a/app/config.py
+++ b/app/config.py
@@ -30,7 +30,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
diff --git a/app/main.py b/app/main.py
index 1149d0d..d1637ac 100644
--- a/app/main.py
+++ b/app/main.py
@@ -373,12 +373,8 @@ def direct_download_url(path):
     pennsieveId = resp['hits']['hits'][0]['_source']['item']['identifier']
     url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
     if url != None:
-        if '.json' in path:
-            resp2 = requests.get(url)
-            return resp2.json()
-        else:
-            resp2 = requests.get(url)
-            return resp2.content
+        resp2 = requests.get(url)
+        return resp2.content
     return jsonify({'error': 'error with the provided ID '}, status=502)
diff --git a/app/scicrunch_processing_common.py b/app/scicrunch_processing_common.py
index d141d87..2e5aad4 100644
--- a/app/scicrunch_processing_common.py
+++ b/app/scicrunch_processing_common.py
@@ -30,6 +30,7 @@
     'application/vnd.mbfbioscience.neurolucida+xml': SEGMENTATION_FILES,
     'inode/vnd.abi.scaffold+directory': SCAFFOLD_DIR,
     'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE,
+    'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE,
     'inode/vnd.abi.scaffold+thumbnail': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.thumbnail+file': SCAFFOLD_THUMBNAIL,
     'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
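The new 'application/x.vnd.abi.scaffold.meta+json' entry in patch 04 widens the mimetype-to-category map so scaffold metadata published under the newer media type still lands in the scaffold-file bucket. A sketch of how such a lookup behaves (the category string here is illustrative, not the constant's actual value in scicrunch_processing_common.py):

    SCAFFOLD_FILE = 'abi-scaffold-metadata-file'  # illustrative value only

    MIMETYPE_TO_CATEGORY = {
        'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE,
        'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE,  # added in patch 04
    }

    def categorise(mimetype):
        # unmapped media types simply yield no category
        return MIMETYPE_TO_CATEGORY.get(mimetype)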
From faad292764362462a7607d8dfbcac1df2a5c74d9 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Mon, 19 Sep 2022 06:02:12 +0530
Subject: [PATCH 05/13] Add check for future scicrunch processing versions

---
 app/scicrunch_process_results.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/app/scicrunch_process_results.py b/app/scicrunch_process_results.py
index 070d468..36c2302 100644
--- a/app/scicrunch_process_results.py
+++ b/app/scicrunch_process_results.py
@@ -13,6 +13,10 @@ def _prepare_results(results):
         except KeyError:
             continue
 
+        if version >= '1.1.5':
+            print('WARNING! Scicrunch processing is out of date!')
+            version = '1.1.5'
+
         package_version = f'scicrunch_processing_v_{version.replace(".", "_")}'
         m = importlib.import_module(f'app.{package_version}')
         attributes_map = getattr(m, 'ATTRIBUTES_MAP')
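One caveat with patch 05: version >= '1.1.5' compares strings lexicographically. That holds while every component stays single-digit, but a hypothetical future '1.1.10' would sort before '1.1.5' and slip past the check. A tuple comparison is the usual safe form; a sketch assuming purely numeric, dot-separated versions:

    def version_tuple(version):
        # '1.1.5' -> (1, 1, 5)
        return tuple(int(part) for part in version.split('.'))

    if version_tuple(version) >= version_tuple('1.1.5'):
        print('WARNING! Scicrunch processing is out of date!')
        version = '1.1.5'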
From 68c9cbbccf0f8a3cf0df7aa45d0702f7844c79e1 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 28 Sep 2022 12:41:40 +1300
Subject: [PATCH 06/13] Add support for never-been-published datasets

---
 app/config.py | 2 +-
 app/main.py   | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/app/config.py b/app/config.py
index 1eddca2..df21497 100644
--- a/app/config.py
+++ b/app/config.py
@@ -30,7 +30,7 @@ class Config(object):
     KNOWLEDGEBASE_KEY = os.environ.get("KNOWLEDGEBASE_KEY", "secret-key")
     DEPLOY_ENV = os.environ.get("DEPLOY_ENV", "development")
     SPARC_APP_HOST = os.environ.get("SPARC_APP_HOST", "https://sparc-app.herokuapp.com")
-    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_dev")
+    SCI_CRUNCH_HOST = os.environ.get("SCICRUNCH_HOST", "https://scicrunch.org/api/1/elastic/SPARC_PortalDatasets_stage")
     MAPSTATE_TABLENAME = os.environ.get("MAPSTATE_TABLENAME", "mapstates")
     SCAFFOLDSTATE_TABLENAME = os.environ.get("SCAFFOLDSTATE_TABLENAME", "scaffoldstates")
     WRIKE_TOKEN = os.environ.get("WRIKE_TOKEN")
diff --git a/app/main.py b/app/main.py
index d1637ac..8c0e3ab 100644
--- a/app/main.py
+++ b/app/main.py
@@ -349,7 +349,7 @@ def presign_resource_url():
 
 # This version of s3-resources is used for accessing files on staging. Use it as a replacement for 's3-resource'.
 # No changes are needed on the front end; just use s3-resource as normal.
-@app.route("/s3-resource/<path:path>")
+# @app.route("/s3-resource/<path:path>")
 def direct_download_url(path):
     print(path)
     filePath = path.split('files/')[-1]
     discoverId = path.split('/')[0]
@@ -377,6 +377,33 @@ def direct_download_url(path):
         resp2 = requests.get(url)
         return resp2.content
     return jsonify({'error': 'error with the provided ID '}, status=502)
 
+# This version of s3-resources is used for accessing files on staging that have never been published
+@app.route("/s3-resource/<path:path>")
+def direct_download_url2(path):
+    print(path)
+    filePath = path.split('files/')[-1]
+    pennsieveId = path.split('/')[0]
+
+    # If the length is small, we have a Pennsieve discover id.
+    # We will process this one with the normal s3-resource route.
+    if len(pennsieveId) <= 4:
+        return direct_download_url(path)
+
+    if 'N:package:' not in pennsieveId:
+        pennsieveId = 'N:dataset:' + pennsieveId
+
+    url = bfWorker.getURLFromDatasetIdAndFilePath(pennsieveId, filePath)
+    if url != None:
+        resp2 = requests.get(url)
+        return resp2.content
+    return jsonify({'error': 'error with the provided ID '}, status=502)
+
+
+@app.route("/proxy/")
+def proxy():
+    url = request.args.get('url')
+    resp = requests.get(url)
+    return resp.content
+    return jsonify({'error': 'error with the provided ID '}, status=502)
 
 @app.route("/scicrunch-dataset/<doi1>/<doi2>")
 def sci_doi(doi1, doi2):

From 05da2131f9d1d57cf0c1ab807ce6199dc89951e4 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 9 Dec 2022 13:24:57 +1300
Subject: [PATCH 07/13] Add pennsieve login script to staging

---
 app/main.py          |  8 ++++++++
 scripts/pennsieve.py | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 scripts/pennsieve.py

diff --git a/app/main.py b/app/main.py
index 0a6a4d2..b80b18b 100644
--- a/app/main.py
+++ b/app/main.py
@@ -4,6 +4,7 @@
 import json
 import logging
 import requests
+from flask import make_response
 
 from apscheduler.schedulers.background import BackgroundScheduler
 from botocore.exceptions import ClientError
@@ -32,6 +33,7 @@
 from app.osparc import run_simulation
 from app.biolucida_process_results import process_results as process_biolucida_results
 from app.bfworker import BFWorker
+from scripts.pennsieve import pennsieve_login, get_banner
 
 app = Flask(__name__)
 # set environment variable
@@ -251,6 +253,12 @@ def create_presigned_url(expiration=3600):
 
     return create_s3_presigned_url(key, content_type, expiration)
 
+@app.route("/get_banner/<datasetId>")
+def get_banner_pen(datasetId):
+    p_temp_key = pennsieve_login()
+    ban = get_banner(p_temp_key, datasetId)
+    return ban
+
 
 @app.route("/thumbnail/neurolucida")
 def thumbnail_from_neurolucida_file():
diff --git a/scripts/pennsieve.py b/scripts/pennsieve.py
new file mode 100644
index 0000000..801e01e
--- /dev/null
+++ b/scripts/pennsieve.py
@@ -0,0 +1,41 @@
+import logging
+import boto3
+from app.config import Config
+import requests
+import json
+
+
+# Returns a pennsieve api token valid for 24 hours
+def pennsieve_login():
+    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/authentication/cognito-config")
+    r.raise_for_status()
+
+    cognito_app_client_id = r.json()["tokenPool"]["appClientId"]
+    cognito_region = r.json()["region"]
+
+    cognito_idp_client = boto3.client(
+        "cognito-idp",
+        region_name=cognito_region,
+        aws_access_key_id="",
+        aws_secret_access_key="",
+    )
+
+    login_response = cognito_idp_client.initiate_auth(
+        AuthFlow="USER_PASSWORD_AUTH",
+        AuthParameters={"USERNAME": Config.PENNSIEVE_API_TOKEN, "PASSWORD": Config.PENNSIEVE_API_SECRET},
+        ClientId=cognito_app_client_id,
+    )
+
+    api_key = login_response["AuthenticationResult"]["AccessToken"]
+    return api_key
+
+
+def get_banner(pennsieve_temp_api_key, dataset_id):
+    print(f"{Config.PENNSIEVE_API_HOST}/datasets/N%3Adataset%3A{dataset_id}/banner?api_key={pennsieve_temp_api_key}")
+    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/datasets/N%3Adataset%3A{dataset_id}/banner",
+                     headers={"Authorization": f"Bearer {pennsieve_temp_api_key}"})
+    r.raise_for_status()
+    return r.json()
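Patch 07's helper trades the stored Pennsieve key/secret for a short-lived Cognito access token via the USER_PASSWORD_AUTH flow and sends it on as a bearer token. A usage sketch (the dataset identifier is a placeholder; get_banner expects the bare UUID without the 'N:dataset:' prefix, which it URL-encodes itself):

    from scripts.pennsieve import pennsieve_login, get_banner

    dataset_id = "00000000-0000-0000-0000-000000000000"  # placeholder UUID
    api_key = pennsieve_login()  # Cognito access token, reportedly valid for 24 hours
    banner = get_banner(api_key, dataset_id)
    print(banner)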
From 86a6318450a3c036a604261d4b6dfb54ae7db5c7 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Mon, 15 May 2023 11:37:05 +1200
Subject: [PATCH 08/13] Add pennsieve id endpoint

---
 app/main.py               | 12 +++++++++++-
 app/scicrunch_requests.py | 16 ++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/app/main.py b/app/main.py
index b80b18b..4217627 100644
--- a/app/main.py
+++ b/app/main.py
@@ -19,7 +19,7 @@
 from app.scicrunch_requests import create_doi_query, create_filter_request, create_facet_query, create_doi_aggregate, create_title_query, \
     create_identifier_query, create_pennsieve_identifier_query, create_field_query, create_request_body_for_curies, create_onto_term_query, \
-    create_multiple_doi_query, create_multiple_discoverId_query
+    create_multiple_doi_query, create_multiple_discoverId_query, create_pennsieve_id_query
 from scripts.email_sender import EmailSender, feedback_email, resource_submission_confirmation_email, creation_request_confirmation_email, issue_reporting_email, community_spotlight_submit_form_email, news_and_events_submit_form_email
 from threading import Lock
 from xml.etree import ElementTree
@@ -563,6 +563,16 @@ def getFileUrlFromPennsieve(discoverId):
         return jsonify({'url': url})
     return jsonify({'error': 'error with the provided ID '}, status=502)
 
+
+@app.route("/dataset_info/using_pennsieveId")
+@app.route("/dataset_info/using_pennsieveId/")
+def get_dataset_info_pennsieve_id():
+    ids = request.args.get('ids')
+    query = create_pennsieve_id_query(ids)
+
+    return process_results(dataset_search(query))
+
+
 @app.route("/dataset_info/using_title")
 def get_dataset_info_title():
     title = request.args.get('title')
diff --git a/app/scicrunch_requests.py b/app/scicrunch_requests.py
index 16749a9..fdb790a 100644
--- a/app/scicrunch_requests.py
+++ b/app/scicrunch_requests.py
@@ -161,6 +161,22 @@ def create_doi_request(doi):
 
     return query
 
+
+def create_pennsieve_id_query(pennsieveId):
+    query = {
+        "size": 50,
+        "from": 0,
+        "query": {
+            "term": {
+                "item.identifier.aggregate": {
+                    "value": f"N:dataset:{pennsieveId}"
+                }
+            }
+        }
+    }
+
+    print(query)
+    return query
+
 
 # create_facet_query(type): Generates facet search request data for sci-crunch given a 'type'; where
 # 'type' is either 'species', 'gender', or 'organ' at this stage.
 # Returns a tuple of the type-map and request data ( type_map, data )

From 51ff6c75acab6e11ba8812e30e7b86d8435b57ef Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 19 May 2023 14:23:48 +1200
Subject: [PATCH 09/13] Pick id for scicrunch based on what is available

---
 app/main.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/app/main.py b/app/main.py
index 4217627..ddb387f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -568,7 +568,11 @@ def getFileUrlFromPennsieve(discoverId):
 @app.route("/dataset_info/using_pennsieveId/")
 def get_dataset_info_pennsieve_id():
     ids = request.args.get('ids')
-    query = create_pennsieve_id_query(ids)
+
+    if len(ids) > 4:
+        query = create_pennsieve_id_query(ids)
+    else:
+        query = create_multiple_discoverId_query([ids])
 
     return process_results(dataset_search(query))

From 84be0e305cb8e1b85ee760ebc4d2fb6b4d1e92b9 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Fri, 9 Jun 2023 13:48:17 +1200
Subject: [PATCH 10/13] Fix pennsieve REST api login not working

---
 app/config.py        | 2 +-
 scripts/pennsieve.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/config.py b/app/config.py
index 0fc706c..0382093 100644
--- a/app/config.py
+++ b/app/config.py
@@ -4,7 +4,7 @@
 
 
 class Config(object):
-    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST")
+    PENNSIEVE_API_HOST = os.environ.get("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
     PENNSIEVE_API_SECRET = os.environ.get("PENNSIEVE_API_SECRET", "local-secret-key")
     PENNSIEVE_API_TOKEN = os.environ.get("PENNSIEVE_API_TOKEN", "local-api-key")
     PENNSIEVE_EMBARGO_TEAM_ID = os.environ.get("PENNSIEVE_EMBARGO_TEAM_ID")
diff --git a/scripts/pennsieve.py b/scripts/pennsieve.py
index 801e01e..6b1b5fb 100644
--- a/scripts/pennsieve.py
+++ b/scripts/pennsieve.py
@@ -18,8 +18,8 @@ def pennsieve_login():
     cognito_idp_client = boto3.client(
         "cognito-idp",
         region_name=cognito_region,
-        aws_access_key_id="",
-        aws_secret_access_key="",
+        aws_access_key_id=Config.SPARC_PORTAL_AWS_KEY,
+        aws_secret_access_key=Config.SPARC_PORTAL_AWS_SECRET,
     )
 
     login_response = cognito_idp_client.initiate_auth(
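The dispatch added in patch 09 is a length heuristic: Pennsieve dataset UUIDs are long strings, while discover identifiers are small integers of at most four digits, so anything longer than four characters is treated as a Pennsieve id. A quick illustration of the rule (the UUID is a placeholder):

    for ids in ('76', '1234', '00000000-0000-0000-0000-000000000000'):
        kind = 'pennsieve uuid' if len(ids) > 4 else 'discover id'
        print(ids, '->', kind)
    # prints: discover id, discover id, pennsieve uuid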
From 681f2bc9b47aceb1c67bb9cd4f49e4862981938f Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Tue, 22 Aug 2023 13:50:48 +1200
Subject: [PATCH 11/13] Add keywords to pass-through list

---
 app/scicrunch_processing_v_1_1_5.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/scicrunch_processing_v_1_1_5.py b/app/scicrunch_processing_v_1_1_5.py
index 342e0e5..5dda5de 100644
--- a/app/scicrunch_processing_v_1_1_5.py
+++ b/app/scicrunch_processing_v_1_1_5.py
@@ -5,7 +5,7 @@
 from app.scicrunch_processing_common import PASS_THROUGH_KEYS as BASE_PASS_THROUGH_KEYS
 from app.manifest_name_to_discover_name import name_map
 
-PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", *BASE_PASS_THROUGH_KEYS]
+PASS_THROUGH_KEYS = ["doi", "dataset_identifier", "dataset_version", "dataset_revision", "keywords", *BASE_PASS_THROUGH_KEYS]
 
 # attributes is used to map desired parameters onto the path of keys needed in the sci-crunch response.
@@ -17,6 +17,7 @@
     'sampleSize': ['item', 'statistics', 'samples', 'count'],
     'subjectSize': ['item', 'statistics', 'subjects', 'count'],
     'name': ['item', 'name'],
+    'keywords': ['item', 'keywords'],
     'description': ['item', 'description'],
     'identifier': ['item', 'identifier'],
     'uri': ['distributions', 'current', 'uri'],

From f653a6768152b17f3382f799e5714854f9a2079e Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 20 Sep 2023 11:46:04 +1200
Subject: [PATCH 12/13] Fix issue in while loop stepping through dataset
 folders

---
 app/bfworker.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/app/bfworker.py b/app/bfworker.py
index de876b6..fc2ee52 100644
--- a/app/bfworker.py
+++ b/app/bfworker.py
@@ -52,18 +52,19 @@ def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
         fileArray = filePath.split('/')
         items = self.bf.get_dataset(datasetId).items
         count = 0
+        depth = 0
         while type(items) is list:
             item = items[count]
-            for fileName in fileArray:
-                if fileName == item.name:
-                    if type(item) is pennsieve.Collection:
-                        items = item.items
-                        count = -1
-                        continue
-                    else:
-                        try:
-                            return item.files[0].url
-                        except:
-                            return None
+            if fileArray[depth] == item.name:
+                if type(item) is pennsieve.Collection:
+                    items = item.items
+                    count = -1
+                    depth += 1
+                    continue
+                else:
+                    try:
+                        return item.files[0].url
+                    except:
+                        return None
             count += 1
         return None

From 2ffc3ca3926274fce7e164f9f6bb4657fb6dd714 Mon Sep 17 00:00:00 2001
From: Jesse Khorasanee
Date: Wed, 20 Sep 2023 12:57:45 +1200
Subject: [PATCH 13/13] Now ignore dataset id and versions when searching
 folders

---
 app/bfworker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/bfworker.py b/app/bfworker.py
index fc2ee52..aa590f9 100644
--- a/app/bfworker.py
+++ b/app/bfworker.py
@@ -50,6 +50,7 @@ def getImagefromPackageId(self, packageId):
 
     def getURLFromDatasetIdAndFilePath(self, datasetId, filePath):
         fileArray = filePath.split('/')
+        fileArray = list(filter(lambda f: not f.isdigit(), fileArray))
         items = self.bf.get_dataset(datasetId).items
         count = 0
         depth = 0
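The filter added in patch 13 drops purely numeric path segments before the folder walk, so portal paths that still carry a discover id and version number in front map cleanly onto the dataset's folder tree. A quick illustration, assuming a path of that shape:

    file_path = '76/2/derivative/scaffold_context_info.json'  # example portal-style path
    file_array = list(filter(lambda f: not f.isdigit(), file_path.split('/')))
    # file_array == ['derivative', 'scaffold_context_info.json']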