Skip to content

Commit

Permalink
Merge pull request #596 from hubmapconsortium/Derek-Furst/prov-info-d…
Browse files Browse the repository at this point in the history
…ataset-type

modified prov-info and <id>/prov-info to use dataset_type instead of …
  • Loading branch information
yuanzhou authored Jan 17, 2024
2 parents 2502e76 + 22a8337 commit 56899dc
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 68 deletions.
65 changes: 8 additions & 57 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2674,7 +2674,6 @@ def get_prov_info():
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
Expand Down Expand Up @@ -2702,12 +2701,11 @@ def get_prov_info():
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
Expand All @@ -2721,13 +2719,12 @@ def get_prov_info():
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
if user_in_hubmap_read_group(request):
published_only = False

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
Expand Down Expand Up @@ -2797,29 +2794,7 @@ def get_prov_info():
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']

# If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])

# first_sample properties are retrieved from its own dictionary
Expand Down Expand Up @@ -2860,7 +2835,7 @@ def get_prov_info():
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict)

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
Expand Down Expand Up @@ -3016,7 +2991,7 @@ def get_prov_info_for_dataset(id):
# Token is not required, but if an invalid token is provided,
# we need to tell the client with a 401 error
validate_token_if_auth_header_exists(request)

organ_types_dict = schema_manager.get_organ_types()
# Use the internal token to query the target entity
# since public entities don't require user token
token = get_internal_token()
Expand Down Expand Up @@ -3055,7 +3030,6 @@ def get_prov_info_for_dataset(id):
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_DATASET_SAMPLES = 'dataset_samples'
Expand Down Expand Up @@ -3083,12 +3057,11 @@ def get_prov_info_for_dataset(id):
HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
Expand All @@ -3100,7 +3073,6 @@ def get_prov_info_for_dataset(id):

# Parsing the organ types yaml has to be done here rather than calling schema.schema_triggers.get_organ_description
# because that would require using a urllib request for each dataset
organ_types_dict = schema_manager.get_organ_types()

# As above, we parse the assay type yaml here rather than calling the special method for it because this avoids
# having to access the resource for every dataset.
Expand All @@ -3124,28 +3096,7 @@ def get_prov_info_for_dataset(id):
internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
except KeyError:
logger.exception(f"Data type {item} not found in resulting assay types via ontology-api")

# Just use the data type value
assay_description_list.append(item)

dataset['data_types'] = assay_description_list
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']
internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
'<identifier>', dataset['uuid'])
Expand Down Expand Up @@ -3185,7 +3136,7 @@ def get_prov_info_for_dataset(id):
distinct_organ_uuid_list.append(item['uuid'])

organ_code = item['organ'].upper()
validate_organ_code(organ_code)
validate_organ_code(organ_code, organ_types_dict )

distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
Expand Down
15 changes: 4 additions & 11 deletions src/app_neo4j_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS")

logger.info("======get_prov_info() query======")
logger.info(query)
Expand Down Expand Up @@ -782,10 +782,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
Expand Down Expand Up @@ -834,7 +831,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET")
logger.info("======get_prov_info() query======")
logger.info(query)

Expand Down Expand Up @@ -877,10 +874,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
record_dict['last_modified_timestamp'] = record_contents[11]
record_dict['last_modified_user_email'] = record_contents[12]
record_dict['lab_dataset_id'] = record_contents[13]
data_types = record_contents[14]
data_types = data_types.replace("'", '"')
data_types = json.loads(data_types)
record_dict['data_types'] = data_types
record_dict['dataset_dataset_type'] = record_contents[14]
content_fifteen = []
for entry in record_contents[15]:
node_dict = schema_neo4j_queries.node_to_dict(entry)
Expand All @@ -891,7 +885,6 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
node_dict = schema_neo4j_queries.node_to_dict(entry)
content_sixteen.append(node_dict)
record_dict['processed_dataset'] = content_sixteen
record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
return record_dict


Expand Down

0 comments on commit 56899dc

Please sign in to comment.