Derek furst/fix put trigger #607

Merged · 4 commits · Jan 31, 2024
100 changes: 17 additions & 83 deletions src/app.py
@@ -2674,7 +2674,6 @@ def get_prov_info():
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
@@ -2702,12 +2701,11 @@ def get_prov_info():
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -2721,13 +2719,12 @@ def get_prov_info():
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
if user_in_hubmap_read_group(request):
published_only = False

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
@@ -2797,29 +2794,7 @@ def get_prov_info():
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']

# If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])

# first_sample properties are retrieved from its own dictionary
@@ -2860,7 +2835,7 @@ def get_prov_info():
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -3016,7 +2991,7 @@ def get_prov_info_for_dataset(id):
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
# Use the internal token to query the target entity
# since public entities don't require user token
token = get_internal_token()
@@ -3055,7 +3030,6 @@ def get_prov_info_for_dataset(id):
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_DATASET_SAMPLES = 'dataset_samples'
@@ -3083,12 +3057,11 @@ def get_prov_info_for_dataset(id):
HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -3100,7 +3073,6 @@ def get_prov_info_for_dataset(id):

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
@@ -3124,28 +3096,7 @@ def get_prov_info_for_dataset(id):
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
'<identifier>', dataset['uuid'])
@@ -3185,7 +3136,7 @@ def get_prov_info_for_dataset(id):
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
@@ -3326,12 +3277,9 @@ def sankey_data():
# String constants
HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
HEADER_ORGAN_TYPE = 'organ_type'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_STATUS = 'dataset_status'

with open('sankey_mapping.json') as f:
mapping_dict = json.load(f)
# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()
@@ -3359,41 +3307,27 @@ def sankey_data():
internal_dict = collections.OrderedDict()
internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]

# TODO BEGIN evaluate elimination of this block once dataset['dataset_type'] is required and dataset['data_types'] removed.
organ_code = dataset[HEADER_ORGAN_TYPE].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
# Data type codes are replaced with data type descriptions
assay_description = ""
try:
assay_description = assay_types_dict[dataset[HEADER_DATASET_DATA_TYPES]]['description']
except KeyError:
logger.exception(f"Data type {dataset[HEADER_DATASET_DATA_TYPES]} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description = dataset[HEADER_DATASET_DATA_TYPES]

internal_dict[HEADER_DATASET_DATA_TYPES] = assay_description
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset[HEADER_DATASET_DATASET_TYPE]

# Replace applicable Group Name and Data type with the value needed for the sankey via the mapping_dict
internal_dict[HEADER_DATASET_STATUS] = dataset['dataset_status']
if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
# if internal_dict[HEADER_DATASET_GROUP_NAME] in mapping_dict.keys():
# internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]

# Each dataset's dictionary is added to the list to be returned
dataset_sankey_list.append(internal_dict)

if MEMCACHED_MODE:
# Cache the result
memcached_client_instance.set(cache_key, dataset_sankey_list, expire = SchemaConstants.MEMCACHED_TTL)
else:
logger.info(f'Using the cached sankey data at time {datetime.now()}')

return jsonify(dataset_sankey_list)


@@ -4807,13 +4741,13 @@ def access_level_prefix_dir(dir_name):
----------
organ_code : str
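organ_types_dict : dict, optional
    Preloaded organ types mapping; when omitted, the function falls back to
    schema_manager.get_organ_types()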
"""
def validate_organ_code(organ_code):
def validate_organ_code(organ_code, organ_types_dict=None):
if organ_types_dict is None:
organ_types_dict = schema_manager.get_organ_types()
if not organ_code.isalpha() or not len(organ_code) == 2:
internal_server_error(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")

try:
organ_types_dict = schema_manager.get_organ_types()

if organ_code.upper() not in organ_types_dict:
not_found_error(f"Unable to find organ code {organ_code} via the ontology-api")
except requests.exceptions.RequestException:
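For reference, the pattern this refactor introduces: validate_organ_code now takes the lookup dictionary as an optional parameter, so bulk endpoints such as get_prov_info can fetch the organ types once and reuse them across every dataset in the loop. A minimal runnable sketch, where fetch_organ_types() and the raised exceptions are hypothetical stand-ins for schema_manager.get_organ_types() and the Flask error helpers (internal_server_error, not_found_error):

def fetch_organ_types() -> dict:
    # Stand-in for the ontology-api lookup, e.g. {'LK': 'Kidney (Left)'}
    return {'LK': 'Kidney (Left)', 'RK': 'Kidney (Right)', 'HT': 'Heart'}

def validate_organ_code(organ_code: str, organ_types_dict: dict = None):
    # Fetch the mapping only when the caller did not preload one
    if organ_types_dict is None:
        organ_types_dict = fetch_organ_types()
    if not organ_code.isalpha() or len(organ_code) != 2:
        raise ValueError(f"Invalid organ code {organ_code}. Must be 2-letter alphabetic code")
    if organ_code.upper() not in organ_types_dict:
        raise KeyError(f"Unable to find organ code {organ_code} via the ontology-api")

# Bulk callers fetch once and reuse the dict for the whole loop
organ_types = fetch_organ_types()
for code in ('LK', 'HT'):
    validate_organ_code(code, organ_types)

Threading the dictionary through the call keeps the per-dataset loop free of repeated ontology-api round trips, which is the motivation stated in the comments above.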
26 changes: 6 additions & 20 deletions src/app_neo4j_queries.py
@@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS")

logger.info("======get_prov_info() query======")
logger.info(query)
@@ -782,10 +782,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -834,7 +831,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET")
logger.info("======get_prov_info() query======")
logger.info(query)

@@ -877,10 +874,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
@@ -891,7 +885,6 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
node_dict = schema_neo4j_queries.node_to_dict(entry)
content_sixteen.append(node_dict)
record_dict['processed_dataset'] = content_sixteen
record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
return record_dict


@@ -942,7 +935,7 @@ def get_sankey_info(neo4j_driver):
query = (f"MATCH (ds:Dataset)<-[]-(a)<-[]-(:Sample)"
# specimen_type -> sample_category 12/15/2022
f"MATCH (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
f"RETURN distinct ds.group_name, organ.organ, ds.data_types, ds.status, ds. uuid order by ds.group_name")
f"RETURN distinct ds.group_name, organ.organ, ds.dataset_type, ds.status, ds. uuid order by ds.group_name")
logger.info("======get_sankey_info() query======")
logger.info(query)
with neo4j_driver.session() as session:
@@ -958,14 +951,7 @@ def get_sankey_info(neo4j_driver):
record_contents.append(item)
record_dict['dataset_group_name'] = record_contents[0]
record_dict['organ_type'] = record_contents[1]
data_types_list = record_contents[2]
data_types_list = data_types_list.replace("'", '"')
data_types_list = json.loads(data_types_list)
data_types = data_types_list[0]
if (len(data_types_list)) > 1:
if (data_types_list[0] == "scRNAseq-10xGenomics-v3" and data_types_list[1] == "snATACseq") or (data_types_list[1] == "scRNAseq-10xGenomics-v3" and data_types_list[0] == "snATACseq"):
data_types = "scRNA-seq (10x Genomics v3),snATAC-seq"
record_dict['dataset_data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[2]
record_dict['dataset_status'] = record_contents[3]
list_of_dictionaries.append(record_dict)
return list_of_dictionaries
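The corresponding simplification on the query side: ds.data_types came back as a stringified Python list that had to be quote-swapped and JSON-parsed, while the scalar ds.dataset_type returned by the new queries is stored directly. A before/after sketch; the field names match the diff, but the sample values are hypothetical:

import json

# Before: ds.data_types arrived as a stringified Python list
old_value = "['scRNAseq-10xGenomics-v3', 'snATACseq']"
data_types = old_value.replace("'", '"')  # single -> double quotes so json.loads accepts it
record_dict = {'data_types': json.loads(data_types)}

# After: ds.dataset_type is a plain string and needs no parsing step
new_value = 'RNAseq'
record_dict = {'dataset_dataset_type': new_value}

The quote swap was also fragile, since any value containing an apostrophe would produce invalid JSON, which is one more reason the scalar field is a simpler contract.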