Libpitt/cedar support #181

Merged · 8 commits · Nov 6, 2023
3 changes: 3 additions & 0 deletions src/instance/app.cfg.example
@@ -99,3 +99,6 @@ DATACITE_SENNET_PREFIX = ''
UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020", "assay_types":{"code": "C004000", "key": "datasets", "endpoint": "datasets?application_context=SENNET"}}'

# CEDAR API KEY, get one at: https://cedar.metadatacenter.org/
CEDAR_API_KEY = ''
49 changes: 49 additions & 0 deletions src/routes/validation/README.md
@@ -0,0 +1,49 @@
# Validating metadata

## Validate using form data
`POST /metadata/validate`

### Payload (Form Data):
```
metadata: (binary) # this is the TSV upload
entity_type: Source
sub_type: murine
```
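
For illustration, this request could be made with Python's `requests`. The host below is a placeholder and the bearer-token header is an assumption, not something specified by this PR:

```python
import requests

# Placeholder host; substitute the actual ingest-api deployment URL.
INGEST_API = "https://ingest-api.example.org"

with open("example_source_mouse_metadata.tsv", "rb") as tsv:
    resp = requests.post(
        f"{INGEST_API}/metadata/validate",
        headers={"Authorization": "Bearer <groups_token>"},  # assumed auth scheme
        files={"metadata": tsv},                             # the TSV upload (binary)
        data={"entity_type": "Source", "sub_type": "murine"},
    )
print(resp.status_code, resp.json())
```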

### Sample response:
The response contains the `metadata` to be stored in the database, and the `pathname`, which can be used for reference and revalidation purposes:
```
{
  "code": 200,
  "metadata": [{"bedding": "Aspen chip", "cage_enhancements": "Nestlets", …}],
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv"
}
```

## Validate using JSON
Alternatively, a `pathname` to an existing file can be passed. This is useful for revalidating a TSV file and comparing its metadata response to another.
This is done in entity-api to verify that the `metadata` posted from the portal-ui is valid.
### Payload (JSON):
```
{
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
  "entity_type": "Source",
  "sub_type": "murine"
}
```
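
A minimal sketch of the same call with a JSON body, reusing the placeholder host and assumed auth header from the sketch above:

```python
import requests

INGEST_API = "https://ingest-api.example.org"  # placeholder host

resp = requests.post(
    f"{INGEST_API}/metadata/validate",
    headers={"Authorization": "Bearer <groups_token>"},  # assumed auth scheme
    json={
        "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
        "entity_type": "Source",
        "sub_type": "murine",
    },
)
print(resp.status_code, resp.json())
```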

## Verify a certain TSV row
To validate a specific row of the file, pass `tsv_row`.
### Payload (JSON):
```
{
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
  "tsv_row": 3,
  "entity_type": "Source",
  "sub_type": "murine"
}
```

## Failed Response
Failed responses return a status of `406 Not Acceptable`.
```
{
  "code": 406,
  "description": [
    "Unexpected fields: {'area_value', 'section_thickness_unit', 'section_thickness_value', 'area_unit', 'histological_report', 'section_index_number'}",
    "Missing fields: {'suspension_enriched_target', 'suspension_entity_number', 'suspension_entity', 'suspension_enriched'}",
    "In column 13, found \"histological_report\", expected \"suspension_entity\"",
    …
  ],
  "name": "Unacceptable Metadata"
}
```
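
A client might surface these errors as follows; this sketch assumes only the response shape shown above:

```python
def report_validation_errors(resp):
    """Print each validation error from a 406 Not Acceptable response."""
    if resp.status_code == 406:
        body = resp.json()
        print(body.get("name"))  # e.g. "Unacceptable Metadata"
        for err in body.get("description", []):
            print(f"- {err}")
```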
130 changes: 128 additions & 2 deletions src/routes/validation/validation.py
@@ -17,10 +17,20 @@
import time
import csv

from ..privs import get_groups_token

validation_blueprint = Blueprint('validation', __name__)
logger = logging.getLogger(__name__)


"""
Checks the uploaded file

Returns
-------
dict
A dictionary containing upload details, or an 'error' key if something went wrong
"""
def check_metadata_upload():
result: dict = {
'error': None
@@ -38,6 +48,14 @@ def check_metadata_upload():
return result


"""
Creates a dictionary of file and path details

Returns
-------
dict
A dictionary containing the filename and fullpath details
"""
def set_file_details(pathname):
base_path = get_base_path()
return {
@@ -46,11 +64,39 @@
}


"""
Parses a tsv and returns its rows

Parameters
----------
path : str
The path where the tsv file is stored

Returns
-------
list
    A list of dictionaries, one per row of the tsv
"""
def get_metadata(path):
result = get_csv_records(path)
return result.get('records')


"""
Calls methods of the Ingest Validation Tools submodule

Parameters
----------
schema : str
Name of the schema to validate against
path : str
The path of the tsv for Ingest Validation Tools

Returns
-------
dict
A dictionary containing validation results
"""
def validate_tsv(schema='metadata', path=None):
try:
schema_name = (
@@ -61,12 +107,33 @@ def validate_tsv(schema='metadata', path=None):
result = {'Preflight': str(e)}
else:
try:
result = iv_utils.get_tsv_errors(path, schema_name=schema_name, report_type=table_validator.ReportType.JSON)
app_context = {
'request_header': {'X-SenNet-Application': 'ingest-api'},
'entities_url': f"{commons_file_helper.ensureTrailingSlashURL(current_app.config['ENTITY_WEBSERVICE_URL'])}entities/"
}
result = iv_utils.get_tsv_errors(path, schema_name=schema_name, report_type=table_validator.ReportType.JSON,
cedar_api_key=current_app.config['CEDAR_API_KEY'], globus_token=get_groups_token(), app_context=app_context)
except Exception as e:
result = rest_server_err(e, True)
return json.dumps(result)


"""
Creates a tsv from path of a specific row.
This is in order to validate only one if necessary.

Parameters
----------
path : str
Path of original tsv
row : int
Row number in tsv to extract for new tsv

Returns
-------
dict
A dictionary containing file details
"""
def create_tsv_from_path(path, row):

result: dict = {
@@ -85,6 +152,21 @@ def create_tsv_from_path(path, row):

return result

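"""
Returns the CEDAR schema ids for the supported Sample sub types.
(Docstring added for clarity; it is not part of the original diff.)

Returns
-------
dict
    A dictionary mapping sub type names to CEDAR schema ids
"""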
def get_cedar_schema_ids():
return {
'Block': '3e98cee6-d3fb-467b-8d4e-9ba7ee49eeff',
'Section': '01e9bc58-bdf2-49f4-9cf9-dd34f3cc62d7',
'Suspension': 'ea4fb93c-508e-4ec4-8a4b-89492ba68088'
}


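"""
Checks that the `metadata_schema_id` in the uploaded tsv matches the expected
CEDAR schema id for the given entity type and sub type.
(Docstring added for clarity; it is not part of the original diff.)

Parameters
----------
entity_type : str
    The entity type
sub_type : str
    The sub type of the entity
upload : dict
    Upload details, containing the 'fullpath' of the tsv

Returns
-------
bool
    True if the ids match (or no CEDAR check applies), False otherwise
"""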
def check_cedar(entity_type, sub_type, upload):
records = get_metadata(upload.get('fullpath'))
if len(records) > 0:
if equals(entity_type, Ontology.ops().entities().SAMPLE) and 'metadata_schema_id' in records[0]:
cedar_sample_sub_type_ids = get_cedar_schema_ids()
return equals(records[0]['metadata_schema_id'], cedar_sample_sub_type_ids[sub_type])
return True

def determine_schema(entity_type, sub_type):
if equals(entity_type, Ontology.ops().entities().SOURCE):
@@ -112,7 +194,19 @@ def _get_response(metadata, entity_type, sub_type, validate_uuids, pathname=None

return response


"""
Returns the tsv id column name for the given entity type

Parameters
----------
entity_type : str
The entity type

Returns
-------
str
The name of the column in the tsv
"""
def get_col_id_name_by_entity_type(entity_type):
if equals(entity_type, Ontology.ops().entities().SAMPLE):
return 'sample_id'
@@ -136,29 +230,55 @@ def supported_metadata_sub_types(entity_type):
Ontology.ops().specimen_categories().SECTION,
Ontology.ops().specimen_categories().SUSPENSION]

"""
Validates the uuids / SenNet ids of given records.
This is used for bulk upload so that ancestor ids referenced by the user in TSVs
are found to actually exist, are supported, and conform to entity constraints.

Parameters
----------
records : list
The set of records to validate
entity_type : str
The entity type
sub_type : str
The sub type of the entity
pathname : str
The pathname of the tsv.
(This is always returned in the response for tracking and other re-validation purposes.)

Returns
-------
Response
Rest response containing results of validation
"""
def validate_records_uuids(records, entity_type, sub_type, pathname):
errors = []
passing = []
header = get_auth_header()
ok = True
idx = 1
for r in records:
# First get the id column name, in order to get SenNet id in the record
id_col = get_col_id_name_by_entity_type(entity_type)
entity_id = r.get(id_col)
# Use the SenNet id to find the stored entity
url = commons_file_helper.ensureTrailingSlashURL(current_app.config['ENTITY_WEBSERVICE_URL']) + 'entities/' + entity_id
resp = requests.get(url, headers=header)
if resp.status_code < 300:
entity = resp.json()
if sub_type is not None:
sub_type_col = get_sub_type_name_by_entity_type(entity_type)
_sub_type = entity.get(sub_type_col)
# Check that the stored entity _sub_type is actually supported for validation
if _sub_type not in supported_metadata_sub_types(entity_type):
ok = False
errors.append(rest_response(StatusCodes.UNACCEPTABLE, StatusMsgs.UNACCEPTABLE,
ln_err(f"of `{to_title_case(_sub_type)}` unsupported "
f"on check of given `{entity_id}`. "
f"Supported `{'`, `'.join(supported_metadata_sub_types(entity_type))}`.",
idx, sub_type_col), dict_only=True))
# Check that the stored entity _sub_type matches what is expected (the type being bulk uploaded)
elif not equals(sub_type, _sub_type):
ok = False
errors.append(rest_response(StatusCodes.UNACCEPTABLE, StatusMsgs.UNACCEPTABLE,
@@ -211,6 +331,12 @@ def validate_metadata_upload():
response = error

if error is None:
if check_cedar(entity_type, sub_type, upload) is False:
id = get_cedar_schema_ids().get(sub_type)
return rest_response(StatusCodes.UNACCEPTABLE, 'Unacceptable Metadata',
f"Mismatch of \"{entity_type} {sub_type}\" and \"metadata_schema_id\". Valid id for \"{sub_type}\": {id}. "
f"For more details, check out the docs: https://docs.sennetconsortium.org/libraries/ingest-validation-tools/schemas")

schema = determine_schema(entity_type, sub_type)
validation_results = validate_tsv(path=upload.get('fullpath'), schema=schema)
if len(validation_results) > 2: