Merge pull request #162 from ypriverol/patch-01

Major changes and bug fixing
bigbio · Feb 28, 2024 · b78c2b5 · b78c2b5
2 parents 01e73d8 + f5a449c
commit b78c2b5
Show file tree

Hide file tree

Showing 6 changed files with 102 additions and 40 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -5,9 +5,9 @@ name: Continuous integration Unit tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   PythonBlack:

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
@@ -5,9 +5,9 @@ name: Python application
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   build:

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -5,9 +5,9 @@ name: Python package
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   build:

diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.24"
+__version__ = "0.0.25"
diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -67,12 +67,12 @@ def ontology_term_parser(cell_value: str = None):
 
 class SDRFColumn(Column):
     def __init__(
-        self,
-        name: str,
-        validations: typing.Iterable["_BaseValidation"] = None,
-        optional_validations: typing.Iterable["_BaseValidation"] = None,
-        allow_empty=False,
-        optional_type=True,
+            self,
+            name: str,
+            validations: typing.Iterable["_BaseValidation"] = None,
+            optional_validations: typing.Iterable["_BaseValidation"] = None,
+            allow_empty=False,
+            optional_type=True,
     ):
         if validations is None:
             validations = []
@@ -145,8 +145,8 @@ def validate(self, series: pd.Series) -> pd.Series:
 
             if ontology_terms is not None:
                 query_labels = [o["label"].lower() for o in ontology_terms]
-                for label in query_labels:
-                    labels.append(label)
+                if term[TERM_NAME] in query_labels:
+                    labels.append(term[TERM_NAME])
         if self._not_available:
             labels.append(NOT_AVAILABLE)
         if self._not_applicable:
@@ -179,6 +179,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
             )
             errors.append(LogicError(error_message, error_type=logging.WARN))
 
+        empty_cells_errors = self.validate_empty_cells(panda_sdrf)
+        if empty_cells_errors:
+            errors.extend(empty_cells_errors)
+
         # Check the mandatory fields
         error_mandatory = self.validate_mandatory_columns(panda_sdrf)
         if error_mandatory is not None:
@@ -218,9 +222,9 @@ def validate_column_names(self, panda_sdrf):
                 errors.append(cname)
             elif m.group().startswith("factor value"):
                 if (
-                    m.group().replace("factor value", "comment") not in panda_sdrf.columns
-                    and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
-                    and m.group() not in panda_sdrf.columns
+                        m.group().replace("factor value", "comment") not in panda_sdrf.columns
+                        and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
+                        and m.group() not in panda_sdrf.columns
                 ):
                     error_message = "The " + cname + " column should also be in the characteristics or comment"
                     logerror.append(LogicError(error_message, error_type=logging.ERROR))
@@ -260,7 +264,7 @@ def validate_columns_order(panda_sdrf):
                     error_message = "The column " + column + "cannot be before the assay name"
                     error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
                 if (
-                    "characteristics" in column or ("material type" in column and "factor value" not in column)
+                        "characteristics" in column or ("material type" in column and "factor value" not in column)
                 ) and cnames.index(column) > index:
                     error_message = "The column " + column + "cannot be after the assay name"
                     error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
@@ -310,6 +314,29 @@ def check_recommendations(self, panda_sdrf):
             warnings += column.validate_optional(series)
         return sorted(warnings, key=lambda e: e.row)
 
+    def validate_empty_cells(self, panda_sdrf):
+        """
+        Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found.
+        A cell is empty when it is None, stringifies to "nan", or contains only whitespace.
+        :param panda_sdrf: SDRF dataframe
+        :return: List of errors, one LogicError (logging.ERROR) per empty cell, in row-major order
+        """
+
+        def validate_string(cell_value):
+            # str() lets non-string cells (numbers, float NaN) be checked instead of crashing on .strip()
+            text = "" if cell_value is None else str(cell_value)
+            return text != "nan" and len(text.strip()) > 0
+
+        # DataFrame.map was added in pandas 2.1; earlier versions only provide applymap
+        apply_elementwise = panda_sdrf.map if hasattr(type(panda_sdrf), "map") else panda_sdrf.applymap
+        validation_results = apply_elementwise(validate_string)
+
+        # Iterate rows directly so duplicated index/column labels cannot make the lookup ambiguous
+        return [
+            LogicError(f"Empty value found Row: {row}, Column: {col}", error_type=logging.ERROR)
+            for row, flags in validation_results.iterrows() for col, ok in flags.items() if not ok
+        ]
+
 
 default_schema = SDRFSchema(
     [

diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py
@@ -105,14 +105,17 @@ def get_ancestors(self, ont, iri):
             raise ex
 
     def search(
-        self,
-        name,
-        query_fields=None,
-        ontology=None,
-        field_list=None,
-        children_of=None,
-        exact=None,
-        bytype="class",
+            self,
+            name: str,
+            query_fields=None,
+            ontology: str=None,
+            field_list=None,
+            children_of=None,
+            exact: bool=None,
+            bytype: str="class",
+            rows: int=10,
+            num_retries:int=10,
+            start: int=0,
     ):
         """
         Searches the OLS with the given term
@@ -124,6 +127,8 @@ def search(
         @:param exact: Forces exact match if not `None`
         @:param bytype: restrict to terms one of {class,property,individual,ontology}
         @:param childrenOf: Search only under a certain term.
+        @:param rows: number of rows to query on each call of OLS search
+        @:param num_retries: Number of retries to OLS when it fails.
         """
         params = {"q": name}
         if ontology is not None:
@@ -135,6 +140,9 @@ def search(
         if bytype:
             params["type"] = _concat_str_or_list(bytype)
 
+        if rows:
+            params["rows"] = rows
+
         if ontology:
             params["ontology"] = _concat_str_or_list(ontology)
         elif self.ontology:
@@ -155,26 +163,53 @@ def search(
         if len(children_of) > 0:
             params["childrenOf"] = _concat_str_or_list(children_of)
 
-        retry_num = 0
+        if start:
+            params["start"] = start
+
+        docs_found = []
 
-        while retry_num < 10:
+        for retry_num in range(num_retries):
             try:
                 req = self.session.get(self.ontology_search, params=params)
-                logger.debug("Request to OLS search API: %s - %s", req.status_code, name)
+                logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code)
 
-                req.raise_for_status()
-                if req.json()["response"]["numFound"]:
-                    return req.json()["response"]["docs"]
-                if exact:
-                    logger.debug("OLS exact search returned empty response for %s", name)
+                if req.status_code != 200:
+                    logger.error("OLS search term %s error tried number %s", name, retry_num)
+                    req.raise_for_status()
                 else:
-                    logger.debug("OLS search returned empty response for %s", name)
-                return None
+                    if req.json()["response"]["numFound"] == 0:
+                        if exact:
+                            logger.debug("OLS exact search returned empty response for %s", name)
+                        else:
+                            logger.debug("OLS search returned empty response for %s", name)
+                        return docs_found
+                    elif len(req.json()["response"]["docs"]) < rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        docs_found = req.json()["response"]["docs"]
+                        docs_found.extend(self.search(name, query_fields=query_fields, ontology=ontology,
+                                                      field_list=field_list, children_of=children_of, exact=exact,
+                                                      bytype=bytype, rows=rows, num_retries=num_retries,
+                                                      start=(rows + (start))))
+                        return docs_found
+
+                if req.status_code == 200 and req.json()["response"]["numFound"] == 0:
+                    if exact:
+                        logger.debug("OLS exact search returned empty response for %s", name)
+                    else:
+                        logger.debug("OLS search returned empty response for %s", name)
+                    return None
+                elif req.status_code != 200 and req.json()["response"]["numFound"] > 0:
+                    if len(req.json()["response"]["docs"]) <= rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        start = 0
+                        docs_found = req.json()["response"]["docs"]
+
             except Exception as ex:
-                retry_num += 1
-                logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
+                logger.exception("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
 
-        return None
+        return docs_found
 
     def suggest(self, name, ontology=None):
         """Suggest terms from an optional list of ontologies