Libpitt/cedar support #181

Merged · 8 commits · Nov 6, 2023
3 changes: 3 additions & 0 deletions src/instance/app.cfg.example
@@ -99,3 +99,6 @@ DATACITE_SENNET_PREFIX = ''
UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020", "assay_types":{"code": "C004000", "key": "datasets", "endpoint": "datasets?application_context=SENNET"}}'

# CEDAR API KEY, get one at: https://cedar.metadatacenter.org/
CEDAR_API_KEY = ''
49 changes: 49 additions & 0 deletions src/routes/validation/README.md
@@ -0,0 +1,49 @@
# Validating metadata

## Validate using form data
`POST /metadata/validate`

### Payload (Form Data):
```
metadata: (binary) # this is the TSV upload
entity_type: Source
sub_type: murine
```
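
For illustration, this request could be made with Python's `requests`. The host below is a placeholder and the bearer-token header is an assumption, not something specified by this PR:

```python
import requests

# Placeholder host; substitute the actual ingest-api deployment URL.
INGEST_API = "https://ingest-api.example.org"

with open("example_source_mouse_metadata.tsv", "rb") as tsv:
    resp = requests.post(
        f"{INGEST_API}/metadata/validate",
        headers={"Authorization": "Bearer <groups_token>"},  # assumed auth scheme
        files={"metadata": tsv},                             # the TSV upload (binary)
        data={"entity_type": "Source", "sub_type": "murine"},
    )
print(resp.status_code, resp.json())
```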

### Sample response:
The response contains the `metadata` to be stored in the database, and the `pathname`, which can be used for reference and revalidation purposes:
```
{
  "code": 200,
  "metadata": [{"bedding": "Aspen chip", "cage_enhancements": "Nestlets", …}],
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv"
}
```

## Validate using JSON
Alternatively, a `pathname` to an existing file can be passed. This is useful for revalidating a TSV file and comparing its metadata response to another.
This is done in entity-api to verify that the `metadata` posted from the portal-ui is valid.
### Payload (JSON):
```
{
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
  "entity_type": "Source",
  "sub_type": "murine"
}
```
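
A minimal sketch of the same call with a JSON body, reusing the placeholder host and assumed auth header from the sketch above:

```python
import requests

INGEST_API = "https://ingest-api.example.org"  # placeholder host

resp = requests.post(
    f"{INGEST_API}/metadata/validate",
    headers={"Authorization": "Bearer <groups_token>"},  # assumed auth scheme
    json={
        "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
        "entity_type": "Source",
        "sub_type": "murine",
    },
)
print(resp.status_code, resp.json())
```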

## Verify a certain TSV row
To validate a specific row of the file, pass `tsv_row`.
### Payload (JSON):
```
{
  "pathname": "cr46sq7pbn594v2btqst/example_source_mouse_metadata.tsv",
  "tsv_row": 3,
  "entity_type": "Source",
  "sub_type": "murine"
}
```

## Failed Response
Failed responses return a status of `406 Not Acceptable`.
```
{
  "code": 406,
  "description": [
    "Unexpected fields: {'area_value', 'section_thickness_unit', 'section_thickness_value', 'area_unit', 'histological_report', 'section_index_number'}",
    "Missing fields: {'suspension_enriched_target', 'suspension_entity_number', 'suspension_entity', 'suspension_enriched'}",
    "In column 13, found \"histological_report\", expected \"suspension_entity\"",
    …
  ],
  "name": "Unacceptable Metadata"
}
```
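
A client might surface these errors as follows; this sketch assumes only the response shape shown above:

```python
def report_validation_errors(resp):
    """Print each validation error from a 406 Not Acceptable response."""
    if resp.status_code == 406:
        body = resp.json()
        print(body.get("name"))  # e.g. "Unacceptable Metadata"
        for err in body.get("description", []):
            print(f"- {err}")
```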
130 changes: 128 additions & 2 deletions src/routes/validation/validation.py
@@ -17,10 +17,20 @@
import time
import csv

from ..privs import get_groups_token

validation_blueprint = Blueprint('validation', __name__)
logger = logging.getLogger(__name__)


"""
Checks the uploaded file

Returns
-------
dict
A dictionary containing upload details, or an 'error' key if something went wrong
"""
def check_metadata_upload():
result: dict = {
'error': None
@@ -38,6 +48,14 @@ def check_metadata_upload():
return result


"""
Creates a dictionary of file and path details

Returns
-------
dict
A dictionary containing the filename and fullpath details
"""
def set_file_details(pathname):
base_path = get_base_path()
return {
@@ -46,11 +64,39 @@
}


"""
Parses a tsv and returns its rows

Parameters
----------
path : str
The path where the tsv file is stored

Returns
-------
list
    A list of dictionaries, one per row of the tsv
"""
def get_metadata(path):
result = get_csv_records(path)
return result.get('records')


"""
Calls methods of the Ingest Validation Tools submodule

Parameters
----------
schema : str
Name of the schema to validate against
path : str
The path of the tsv for Ingest Validation Tools

Returns
-------
dict
A dictionary containing validation results
"""
def validate_tsv(schema='metadata', path=None):
try:
schema_name = (
@@ -61,12 +107,33 @@ def validate_tsv(schema='metadata', path=None):
result = {'Preflight': str(e)}
else:
try:
result = iv_utils.get_tsv_errors(path, schema_name=schema_name, report_type=table_validator.ReportType.JSON)
app_context = {
'request_header': {'X-SenNet-Application': 'ingest-api'},
'entities_url': f"{commons_file_helper.ensureTrailingSlashURL(current_app.config['ENTITY_WEBSERVICE_URL'])}entities/"
}
result = iv_utils.get_tsv_errors(path, schema_name=schema_name, report_type=table_validator.ReportType.JSON,
cedar_api_key=current_app.config['CEDAR_API_KEY'], globus_token=get_groups_token(), app_context=app_context)
except Exception as e:
result = rest_server_err(e, True)
return json.dumps(result)


"""
Creates a tsv from path of a specific row.
This is in order to validate only one if necessary.

Parameters
----------
path : str
Path of original tsv
row : int
Row number in tsv to extract for new tsv

Returns
-------
dict
A dictionary containing file details
"""
def create_tsv_from_path(path, row):

result: dict = {
@@ -85,6 +152,21 @@ def create_tsv_from_path(path, row):

return result

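"""
Returns the CEDAR schema ids for the supported Sample sub types.
(Docstring added for clarity; it is not part of the original diff.)

Returns
-------
dict
    A dictionary mapping sub type names to CEDAR schema ids
"""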
def get_cedar_schema_ids():
return {
'Block': '3e98cee6-d3fb-467b-8d4e-9ba7ee49eeff',
'Section': '01e9bc58-bdf2-49f4-9cf9-dd34f3cc62d7',
'Suspension': 'ea4fb93c-508e-4ec4-8a4b-89492ba68088'
}


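"""
Checks that the `metadata_schema_id` in the uploaded tsv matches the expected
CEDAR schema id for the given entity type and sub type.
(Docstring added for clarity; it is not part of the original diff.)

Parameters
----------
entity_type : str
    The entity type
sub_type : str
    The sub type of the entity
upload : dict
    Upload details, containing the 'fullpath' of the tsv

Returns
-------
bool
    True if the ids match (or no CEDAR check applies), False otherwise
"""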
def check_cedar(entity_type, sub_type, upload):
records = get_metadata(upload.get('fullpath'))
if len(records) > 0:
if equals(entity_type, Ontology.ops().entities().SAMPLE) and 'metadata_schema_id' in records[0]:
cedar_sample_sub_type_ids = get_cedar_schema_ids()
return equals(records[0]['metadata_schema_id'], cedar_sample_sub_type_ids[sub_type])
return True

def determine_schema(entity_type, sub_type):
if equals(entity_type, Ontology.ops().entities().SOURCE):
@@ -112,7 +194,19 @@ def _get_response(metadata, entity_type, sub_type, validate_uuids, pathname=None

return response


"""
Returns the tsv id column name for the given entity type

Parameters
----------
entity_type : str
The entity type

Returns
-------
str
The name of the column in the tsv
"""
def get_col_id_name_by_entity_type(entity_type):
if equals(entity_type, Ontology.ops().entities().SAMPLE):
return 'sample_id'
@@ -136,29 +230,55 @@ def supported_metadata_sub_types(entity_type):
Ontology.ops().specimen_categories().SECTION,
Ontology.ops().specimen_categories().SUSPENSION]

"""
Validates the uuids / SenNet ids of given records.
This is used for bulk upload so that ancestor ids referenced by the user in TSVs
are found to actually exist, are supported, and conform to entity constraints.

Parameters
----------
records : list
The set of records to validate
entity_type : str
The entity type
sub_type : str
The sub type of the entity
pathname : str
The pathname of the tsv.
(This is always returned in the response for tracking and other re-validation purposes.)

Returns
-------
Response
Rest response containing results of validation
"""
def validate_records_uuids(records, entity_type, sub_type, pathname):
errors = []
passing = []
header = get_auth_header()
ok = True
idx = 1
for r in records:
# First get the id column name, in order to get SenNet id in the record
id_col = get_col_id_name_by_entity_type(entity_type)
entity_id = r.get(id_col)
# Use the SenNet id to find the stored entity
url = commons_file_helper.ensureTrailingSlashURL(current_app.config['ENTITY_WEBSERVICE_URL']) + 'entities/' + entity_id
resp = requests.get(url, headers=header)
if resp.status_code < 300:
entity = resp.json()
if sub_type is not None:
sub_type_col = get_sub_type_name_by_entity_type(entity_type)
_sub_type = entity.get(sub_type_col)
# Check that the stored entity _sub_type is actually supported for validation
if _sub_type not in supported_metadata_sub_types(entity_type):
ok = False
errors.append(rest_response(StatusCodes.UNACCEPTABLE, StatusMsgs.UNACCEPTABLE,
ln_err(f"of `{to_title_case(_sub_type)}` unsupported "
f"on check of given `{entity_id}`. "
f"Supported `{'`, `'.join(supported_metadata_sub_types(entity_type))}`.",
idx, sub_type_col), dict_only=True))
# Check that the stored entity _sub_type matches what is expected (the type being bulk uploaded)
elif not equals(sub_type, _sub_type):
ok = False
errors.append(rest_response(StatusCodes.UNACCEPTABLE, StatusMsgs.UNACCEPTABLE,
@@ -211,6 +331,12 @@ def validate_metadata_upload():
response = error

if error is None:
if check_cedar(entity_type, sub_type, upload) is False:
id = get_cedar_schema_ids().get(sub_type)
return rest_response(StatusCodes.UNACCEPTABLE, 'Unacceptable Metadata',
f"Mismatch of \"{entity_type} {sub_type}\" and \"metadata_schema_id\". Valid id for \"{sub_type}\": {id}. "
f"For more details, check out the docs: https://docs.sennetconsortium.org/libraries/ingest-validation-tools/schemas")

schema = determine_schema(entity_type, sub_type)
validation_results = validate_tsv(path=upload.get('fullpath'), schema=schema)
if len(validation_results) > 2: