Skip to content

Commit 3f2a059

Browse files
authored
Merge pull request #232 from CBIIT/CRDCDH-2370-001-pgu
Crdcdh 2370
2 parents 1726126 + a899c89 commit 3f2a059

File tree

3 files changed

+103
-34
lines changed

3 files changed

+103
-34
lines changed

configs/messages_configuration.yml

+5
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ Messages:
130130
severity: "Error"
131131
message: '"properties" is not defined in the model.'
132132
description: "Data model missing properties definitions."
133+
M027:
134+
title: "CDE not available"
135+
severity: "Error"
136+
message: '{} CDE for property "{}" is not available. Please try again later and contact the helpdesk if this error persists.'
137+
description: "CDE version doesn't exist in DB, and on-demand pull failed."
133138

134139
# data file validation message
135140
F001:

src/metadata_validator.py

+27-25
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python3
2-
import pandas as pd
32
import json
43
from datetime import datetime
54
from bento.common.sqs import VisibilityExtender
@@ -18,10 +17,11 @@
1817
from common.model_reader import valid_prop_types
1918
from service.ecs_agent import set_scale_in_protection
2019
from x_submission_validator import CrossSubmissionValidator
21-
from pv_puller import get_pv_by_code_version
20+
from pv_puller import get_pv_by_datacommon_version_cde
2221

2322
VISIBILITY_TIMEOUT = 20
2423
BATCH_SIZE = 1000
24+
CDE_NOT_FOUND = "CDE not available"
2525

2626
def metadataValidate(configs, job_queue, mongo_dao):
2727
log = get_logger('Metadata Validation Service')
@@ -93,12 +93,6 @@ def metadataValidate(configs, job_queue, mongo_dao):
9393
except KeyboardInterrupt:
9494
log.info('Good bye!')
9595
return
96-
97-
98-
""" Requirement for the ticket crdcdh-343
99-
For metadata: validate data folder contains TSV or TXT files
100-
Compose a list of files to be updated and their sizes (metadata or files)
101-
"""
10296
class MetaDataValidator:
10397

10498
def __init__(self, mongo_dao, model_store, config):
@@ -112,6 +106,8 @@ def __init__(self, mongo_dao, model_store, config):
112106
self.submission = None
113107
self.isError = None
114108
self.isWarning = None
109+
self.searched_sts = False
110+
self.not_found_cde = False
115111

116112

117113
def validate(self, submission_id, scope):
@@ -487,7 +483,9 @@ def validate_prop_value(self, prop_name, value, prop_def, msg_prefix):
487483
val = None
488484
minimum = prop_def.get(MIN)
489485
maximum = prop_def.get(MAX)
490-
permissive_vals = self.get_permissive_value(prop_def)
486+
permissive_vals, msg = self.get_permissive_value(prop_def)
487+
if msg and msg == CDE_NOT_FOUND:
488+
errors.append(create_error("M027", [msg_prefix, prop_name], prop_name, value))
491489
if type == "string":
492490
val = str(value)
493491
result, error = check_permissive(val, permissive_vals, msg_prefix, prop_name, self.mongo_dao)
@@ -520,12 +518,6 @@ def validate_prop_value(self, prop_name, value, prop_def, msg_prefix):
520518
if len(errs) > 0:
521519
errors.extend(errs)
522520

523-
# elif type == "datetime":
524-
# try:
525-
# val = datetime.strptime(value, DATETIME_FORMAT)
526-
# except ValueError as e:
527-
# errors.append(create_error("Invalid datetime value", f'{msg_prefix} Property "{prop_name}": "{value}" is not a valid datetime type.'))
528-
529521
elif type == "date" or type == "datetime":
530522
val = None
531523
for date_format in DATE_FORMATS:
@@ -562,6 +554,7 @@ def validate_prop_value(self, prop_name, value, prop_def, msg_prefix):
562554
"""
563555
def get_permissive_value(self, prop_def):
564556
permissive_vals = prop_def.get("permissible_values")
557+
msg = None
565558
if prop_def.get(CDE_TERM) and len(prop_def.get(CDE_TERM)) > 0:
566559
# retrieve permissible values from DB or cde site
567560
cde_code = None
@@ -580,19 +573,28 @@ def get_permissive_value(self, prop_def):
580573
else:
581574
permissive_vals = None
582575
else:
583-
# call pv_puller to get permissible values from caDSR
584-
cde, msg = get_pv_by_code_version(self.config, self.log, self.datacommon, prop_def["name"], cde_code, cde_version)
585-
if cde:
586-
if cde.get(CDE_PERMISSIVE_VALUES) is not None:
587-
if len(cde[CDE_PERMISSIVE_VALUES]) > 0:
588-
permissive_vals = cde[CDE_PERMISSIVE_VALUES]
589-
else:
590-
permissive_vals = None #escape validation
591-
self.mongo_dao.insert_cde([cde])
576+
if not self.searched_sts:
577+
cde = get_pv_by_datacommon_version_cde(self.config[TIER_CONFIG], self.submission[DATA_COMMON_NAME],
578+
self.submission[MODEL_VERSION], cde_code, cde_version, self.log, self.mongo_dao)
579+
self.searched_sts = True
580+
if cde:
581+
if cde.get(CDE_PERMISSIVE_VALUES) is not None:
582+
if len(cde[CDE_PERMISSIVE_VALUES]) > 0:
583+
permissive_vals = cde[CDE_PERMISSIVE_VALUES]
584+
else:
585+
permissive_vals = None #escape validation
586+
else:
587+
msg = CDE_NOT_FOUND
588+
self.not_found_cde = True
589+
else:
590+
if self.not_found_cde:
591+
msg = CDE_NOT_FOUND
592+
593+
592594
# strip white space if the value is string
593595
if permissive_vals and len(permissive_vals) > 0 and isinstance(permissive_vals[0], str):
594596
permissive_vals = [item.strip() for item in permissive_vals]
595-
return permissive_vals
597+
return permissive_vals, msg
596598

597599
"""util functions"""
598600
def check_permissive(value, permissive_vals, msg_prefix, prop_name, dao):

src/pv_puller.py

+71-9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
FILE_DOWNLOAD_URL = "download_url"
1313
FILE_NAME = "name"
1414
FILE_TYPE = "type"
15+
CDE_PV_NAME = "permissibleValues"
16+
STS_FILE_URL = "https://raw.githubusercontent.com/CBIIT/crdc-datahub-terms/{}/{}_{}_sts.json"
1517

1618
def pull_pv_lists(configs, mongo_dao):
1719
"""
@@ -94,13 +96,9 @@ def extract_cde_from_file(self, download_url, cde_set, cde_record):
9496
if cde_key in cde_set:
9597
continue
9698
cde_set.add(cde_key)
97-
cde_long_name = cde.get(CDE_FULL_NAME)
98-
cde_record.append({
99-
CDE_FULL_NAME: cde_long_name,
100-
CDE_CODE: cde_code,
101-
CDE_VERSION: cde_version,
102-
CDE_PERMISSIVE_VALUES: extract_pv_list(cde.get('permissibleValues'))
103-
})
99+
cde_record.append(
100+
compose_cde_record(cde)
101+
)
104102

105103
except Exception as e:
106104
self.log.exception(e)
@@ -174,6 +172,70 @@ def get_pv_by_code_version(configs, log, data_common, prop_name, cde_code, cde_v
174172
UPDATED_AT: current_datetime(),
175173
}, msg
176174

175+
def get_pv_by_datacommon_version_cde(tier, data_commons, data_model_version, cde_code, cde_version, log, mongo_dao):
176+
"""
177+
extracts the CDE list by data_commons, data_model_version and finds matches based on the provided CDE code and version,
178+
finally saves the CDE list into DB.
179+
180+
:param tier: The tier of the environment (e.g., 'dev', 'qa', 'prod').
181+
:param data_commons: The name of the data commons.
182+
:param data_model_version: The version of the data model.
183+
:param cde_code: The code of the CDE to retrieve.
184+
:param cde_version: The version of the CDE to retrieve.
185+
:param log: Logger instance for logging information and errors.
186+
:param mongo_dao: Data access object for MongoDB operations.
187+
:returns: A dictionary containing the CDE full name, code, version, and permissive values, or None if the CDE is not found or an error occurs.
188+
"""
189+
# construct the sts dump file url based on tier, data commons, and model version
190+
sts_file_url = STS_FILE_URL.format(tier, data_commons.lower(), data_model_version)
191+
try:
192+
log.info(f"Extracting cde from {sts_file_url}")
193+
api_client = APIInvoker({})
194+
result = api_client.get_synonyms(sts_file_url)
195+
if not result or len(result) == 0:
196+
log.error(f"CDE dump file,{sts_file_url} is not found! ")
197+
return None
198+
cde_list = [item for item in result if item.get(CDE_CODE) and item.get(CDE_CODE) != 'null']
199+
if not cde_list or len(cde_list) == 0:
200+
log.error(f"No cde found in {sts_file_url}")
201+
return
202+
203+
cde_set = set()
204+
cde_records = []
205+
return_cde = None
206+
for item in cde_list:
207+
code = item.get(CDE_CODE)
208+
version = item.get(CDE_VERSION) if item.get(CDE_VERSION) and item.get(CDE_VERSION) != 'null' else None
209+
cde_key = (code, version)
210+
if cde_key in cde_set:
211+
continue
212+
cde_set.add(cde_key)
213+
cde_record = compose_cde_record(item)
214+
cde_records.append(
215+
cde_record
216+
)
217+
if code == cde_code and version == cde_version:
218+
return_cde = cde_record
219+
# save all extracted CDEs
220+
mongo_dao.upsert_cde(cde_records)
221+
# return matched CDE
222+
return return_cde
223+
except Exception as e:
224+
log.exception(e)
225+
log.exception(f"Failed to extract cde from {sts_file_url}")
226+
return None
227+
228+
def compose_cde_record(cde_item):
229+
"""
230+
compose cde record from cde dump file
231+
"""
232+
cde_record = {
233+
CDE_FULL_NAME: cde_item.get(CDE_FULL_NAME),
234+
CDE_CODE: cde_item.get(CDE_CODE),
235+
CDE_VERSION: cde_item.get(CDE_VERSION) if cde_item.get(CDE_VERSION) and cde_item.get(CDE_VERSION) != 'null' else None,
236+
CDE_PERMISSIVE_VALUES: extract_pv_list(cde_item.get(CDE_PV_NAME))
237+
}
238+
return cde_record
177239

178240
class SynonymPuller:
179241
"""
@@ -238,9 +300,9 @@ def get_synonyms_by_datacommon_version(self, synonym_url, synonym_set):
238300
self.log.info(f"Synonyms for {synonym_url} are not found! ")
239301
return None
240302
# filter out empty synonyms
241-
synonyms = [item for item in result if item.get('permissibleValues') and item.get('permissibleValues')[0].get('synonyms')]
303+
synonyms = [item for item in result if item.get(CDE_PV_NAME) and item.get(CDE_PV_NAME)[0].get('synonyms')]
242304
for item in synonyms:
243-
pv_list = item.get('permissibleValues')
305+
pv_list = item.get(CDE_PV_NAME)
244306
if pv_list:
245307
for pv in pv_list:
246308
value = pv.get('value')

0 commit comments

Comments
 (0)