diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 090334d0..ab2afc52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,9 +5,9 @@ name: Continuous integration Unit tests on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: PythonBlack: diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 38c78498..bd87c20d 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -5,9 +5,9 @@ name: Python application on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: build: diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index d32ecdb7..5fb3c658 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,9 +5,9 @@ name: Python package on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: build: diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 5681085f..5a6b518d 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.24" +__version__ = "0.0.25" diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index 2c3cf8bf..34b53717 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -67,12 +67,12 @@ def ontology_term_parser(cell_value: str = None): class SDRFColumn(Column): def __init__( - self, - name: str, - validations: typing.Iterable["_BaseValidation"] = None, - optional_validations: typing.Iterable["_BaseValidation"] = None, - allow_empty=False, - optional_type=True, + self, + name: str, + validations: typing.Iterable["_BaseValidation"] = None, + optional_validations: typing.Iterable["_BaseValidation"] = None, + allow_empty=False, + optional_type=True, ): if validations is None: validations = [] @@ -145,8 +145,8 @@ def validate(self, series: pd.Series) -> pd.Series: if ontology_terms is not None: query_labels = [o["label"].lower() for o in ontology_terms] - for label in query_labels: - labels.append(label) + if term[TERM_NAME] in query_labels: + labels.append(term[TERM_NAME]) if self._not_available: labels.append(NOT_AVAILABLE) if self._not_applicable: @@ -179,6 +179,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]: ) errors.append(LogicError(error_message, error_type=logging.WARN)) + empty_cells_errors = self.validate_empty_cells(panda_sdrf) + if empty_cells_errors: + errors.extend(empty_cells_errors) + # Check the mandatory fields error_mandatory = self.validate_mandatory_columns(panda_sdrf) if error_mandatory is not None: @@ -218,9 +222,9 @@ def validate_column_names(self, panda_sdrf): errors.append(cname) elif m.group().startswith("factor value"): if ( - m.group().replace("factor value", "comment") not in panda_sdrf.columns - and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns - and m.group() not in panda_sdrf.columns + m.group().replace("factor value", "comment") not in panda_sdrf.columns + and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns + and m.group() not in panda_sdrf.columns ): error_message = "The " + cname + " column should also be in the characteristics or comment" logerror.append(LogicError(error_message, error_type=logging.ERROR)) @@ -260,7 +264,7 @@ def validate_columns_order(panda_sdrf): error_message = "The column " + column + "cannot be before the assay name" error_columns_order.append(LogicError(error_message, error_type=logging.ERROR)) if ( - "characteristics" in column or ("material type" in column and "factor value" not in column) + "characteristics" in column or ("material type" in column and "factor value" not in column) ) and cnames.index(column) > index: error_message = "The column " + column + "cannot be after the assay name" error_columns_order.append(LogicError(error_message, error_type=logging.ERROR)) @@ -310,6 +314,29 @@ def check_recommendations(self, panda_sdrf): warnings += column.validate_optional(series) return sorted(warnings, key=lambda e: e.row) + def validate_empty_cells(self, panda_sdrf): + """ + Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found. + :param panda_sdrf: SDRF dataframe + :return: List of errors + """ + errors = [] + + def validate_string(cell_value): + return cell_value is not None and cell_value != "nan" and len(cell_value.strip()) > 0 + + # Apply the validation function element-wise + validation_results = panda_sdrf.map(validate_string) + + # Get the indices where the validation fails + failed_indices = [(row, col) for row in validation_results.index for col in validation_results.columns if + not validation_results.at[row, col]] + + for row, col in failed_indices: + message = f"Empty value found Row: {row}, Column: {col}" + errors.append(LogicError(message, error_type=logging.ERROR)) + return errors + default_schema = SDRFSchema( [ diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py index c06084ab..ab9a80cb 100644 --- a/sdrf_pipelines/zooma/ols.py +++ b/sdrf_pipelines/zooma/ols.py @@ -105,14 +105,17 @@ def get_ancestors(self, ont, iri): raise ex def search( - self, - name, - query_fields=None, - ontology=None, - field_list=None, - children_of=None, - exact=None, - bytype="class", + self, + name: str, + query_fields=None, + ontology: str=None, + field_list=None, + children_of=None, + exact: bool=None, + bytype: str="class", + rows: int=10, + num_retries:int=10, + start: int=0, ): """ Searches the OLS with the given term @@ -124,6 +127,8 @@ def search( @:param exact: Forces exact match if not `None` @:param bytype: restrict to terms one of {class,property,individual,ontology} @:param childrenOf: Search only under a certain term. + @:param rows: number of rows to query on each call of OLS search + @:param num_retries: Number of retries to OLS when it fails. """ params = {"q": name} if ontology is not None: @@ -135,6 +140,9 @@ def search( if bytype: params["type"] = _concat_str_or_list(bytype) + if rows: + params["rows"] = rows + if ontology: params["ontology"] = _concat_str_or_list(ontology) elif self.ontology: @@ -155,26 +163,53 @@ def search( if len(children_of) > 0: params["childrenOf"] = _concat_str_or_list(children_of) - retry_num = 0 + if start: + params["start"] = start + + docs_found = [] - while retry_num < 10: + for retry_num in range(num_retries): try: req = self.session.get(self.ontology_search, params=params) - logger.debug("Request to OLS search API: %s - %s", req.status_code, name) + logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code) - req.raise_for_status() - if req.json()["response"]["numFound"]: - return req.json()["response"]["docs"] - if exact: - logger.debug("OLS exact search returned empty response for %s", name) + if req.status_code != 200: + logger.error("OLS search term %s error tried number %s", name, retry_num) + req.raise_for_status() else: - logger.debug("OLS search returned empty response for %s", name) - return None + if req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return docs_found + elif len(req.json()["response"]["docs"]) < rows: + return req.json()["response"]["docs"] + else: + docs_found = req.json()["response"]["docs"] + docs_found.extend(self.search(name, query_fields=query_fields, ontology=ontology, + field_list=field_list, children_of=children_of, exact=exact, + bytype=bytype, rows=rows, num_retries=num_retries, + start=(rows + (start)))) + return docs_found + + if req.status_code == 200 and req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return None + elif req.status_code != 200 and req.json()["response"]["numFound"] > 0: + if len(req.json()["response"]["docs"]) <= rows: + return req.json()["response"]["docs"] + else: + start = 0 + docs_found = req.json()["response"]["docs"] + except Exception as ex: - retry_num += 1 - logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex) + logger.exception("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex) - return None + return docs_found def suggest(self, name, ontology=None): """Suggest terms from an optional list of ontologies