From 90bab1074cb0e321241a4067719607d05252bbb0 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Mon, 8 Jan 2024 18:54:16 +0000
Subject: [PATCH 01/10] Update ci.yml

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 090334d0..ab2afc52 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,9 +5,9 @@ name: Continuous integration Unit tests
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   PythonBlack:

From 958df91b5edf7f2eab1e5175caf0d613f26105c9 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Mon, 8 Jan 2024 18:54:32 +0000
Subject: [PATCH 02/10] Update pythonapp.yml

---
 .github/workflows/pythonapp.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index 38c78498..bd87c20d 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -5,9 +5,9 @@ name: Python application
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   build:

From b20df9d870fc27f95c18692a0ed88c0610255f09 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Mon, 8 Jan 2024 18:54:46 +0000
Subject: [PATCH 03/10] Update pythonpackage.yml

---
 .github/workflows/pythonpackage.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index d32ecdb7..5fb3c658 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -5,9 +5,9 @@ name: Python package
 
 on:
   push:
-    branches: [ master ]
+    branches: [ main ]
   pull_request:
-    branches: [ master ]
+    branches: [ main ]
 
 jobs:
   build:

From 2149cf460232f8c0faad64d5113ef778c0c4d227 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Tue, 27 Feb 2024 14:47:40 +0000
Subject: [PATCH 04/10] Implement recursive pagination in OLS search

---
 sdrf_pipelines/sdrf/sdrf_schema.py |  2 +
 sdrf_pipelines/zooma/ols.py        | 77 ++++++++++++++++++++++--------
 2 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index 2c3cf8bf..69e37d42 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -135,6 +135,8 @@ def validate(self, series: pd.Series) -> pd.Series:
         terms = [ontology_term_parser(x) for x in series.unique()]
         labels = []
         for term in terms:
+            if term['NT'] == 'clostridium perfringens':
+                print(term)
             if TERM_NAME not in term:
                 ontology_terms = None
             else:
diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py
index c06084ab..ab9a80cb 100644
--- a/sdrf_pipelines/zooma/ols.py
+++ b/sdrf_pipelines/zooma/ols.py
@@ -105,14 +105,17 @@ def get_ancestors(self, ont, iri):
             raise ex
 
     def search(
-        self,
-        name,
-        query_fields=None,
-        ontology=None,
-        field_list=None,
-        children_of=None,
-        exact=None,
-        bytype="class",
+            self,
+            name: str,
+            query_fields=None,
+            ontology: str=None,
+            field_list=None,
+            children_of=None,
+            exact: bool=None,
+            bytype: str="class",
+            rows: int=10,
+            num_retries:int=10,
+            start: int=0,
     ):
         """
         Searches the OLS with the given term
@@ -124,6 +127,8 @@ def search(
         @:param exact: Forces exact match if not `None`
         @:param bytype: restrict to terms one of {class,property,individual,ontology}
         @:param childrenOf: Search only under a certain term.
+        @:param rows: number of rows to query on each call of OLS search
+        @:param num_retries: Number of retries to OLS when it fails.
         """
         params = {"q": name}
         if ontology is not None:
@@ -135,6 +140,9 @@ def search(
         if bytype:
             params["type"] = _concat_str_or_list(bytype)
 
+        if rows:
+            params["rows"] = rows
+
         if ontology:
             params["ontology"] = _concat_str_or_list(ontology)
         elif self.ontology:
@@ -155,26 +163,53 @@ def search(
         if len(children_of) > 0:
             params["childrenOf"] = _concat_str_or_list(children_of)
 
-        retry_num = 0
+        if start:
+            params["start"] = start
+
+        docs_found = []
 
-        while retry_num < 10:
+        for retry_num in range(num_retries):
             try:
                 req = self.session.get(self.ontology_search, params=params)
-                logger.debug("Request to OLS search API: %s - %s", req.status_code, name)
+                logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code)
 
-                req.raise_for_status()
-                if req.json()["response"]["numFound"]:
-                    return req.json()["response"]["docs"]
-                if exact:
-                    logger.debug("OLS exact search returned empty response for %s", name)
+                if req.status_code != 200:
+                    logger.error("OLS search term %s error tried number %s", name, retry_num)
+                    req.raise_for_status()
                 else:
-                    logger.debug("OLS search returned empty response for %s", name)
-                return None
+                    if req.json()["response"]["numFound"] == 0:
+                        if exact:
+                            logger.debug("OLS exact search returned empty response for %s", name)
+                        else:
+                            logger.debug("OLS search returned empty response for %s", name)
+                        return docs_found
+                    elif len(req.json()["response"]["docs"]) < rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        docs_found = req.json()["response"]["docs"]
+                        docs_found.extend(self.search(name, query_fields=query_fields, ontology=ontology,
+                                                      field_list=field_list, children_of=children_of, exact=exact,
+                                                      bytype=bytype, rows=rows, num_retries=num_retries,
+                                                      start=(rows + (start))))
+                        return docs_found
+
+                if req.status_code == 200 and req.json()["response"]["numFound"] == 0:
+                    if exact:
+                        logger.debug("OLS exact search returned empty response for %s", name)
+                    else:
+                        logger.debug("OLS search returned empty response for %s", name)
+                    return None
+                elif req.status_code != 200 and req.json()["response"]["numFound"] > 0:
+                    if len(req.json()["response"]["docs"]) <= rows:
+                        return req.json()["response"]["docs"]
+                    else:
+                        start = 0
+                        docs_found = req.json()["response"]["docs"]
+
             except Exception as ex:
-                retry_num += 1
-                logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
+                logger.exception("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
 
-        return None
+        return docs_found
 
     def suggest(self, name, ontology=None):
         """Suggest terms from an optional list of ontologies

From 2fd27a77856ce30f7de087cb05e3be4e4340dfd4 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Tue, 27 Feb 2024 16:05:12 +0000
Subject: [PATCH 05/10] check empty cells in sdrf

---
 sdrf_pipelines/sdrf/sdrf_schema.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index 69e37d42..d79a599b 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -3,6 +3,7 @@
 import typing
 from typing import Any
 
+import numpy as np
 import pandas as pd
 from pandas_schema import Column
 from pandas_schema import Schema
@@ -135,8 +136,6 @@ def validate(self, series: pd.Series) -> pd.Series:
         terms = [ontology_term_parser(x) for x in series.unique()]
         labels = []
         for term in terms:
-            if term['NT'] == 'clostridium perfringens':
-                print(term)
             if TERM_NAME not in term:
                 ontology_terms = None
             else:
@@ -181,6 +180,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
             )
             errors.append(LogicError(error_message, error_type=logging.WARN))
 
+        empty_cells_errors = self.validate_empty_cells(panda_sdrf)
+        if empty_cells_errors:
+            errors.extend(empty_cells_errors)
+
         # Check the mandatory fields
         error_mandatory = self.validate_mandatory_columns(panda_sdrf)
         if error_mandatory is not None:
@@ -312,6 +315,28 @@ def check_recommendations(self, panda_sdrf):
             warnings += column.validate_optional(series)
         return sorted(warnings, key=lambda e: e.row)
 
+    def validate_empty_cells(self, panda_sdrf):
+        """
+        Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found.
+        :param panda_sdrf: SDRF dataframe
+        :return: List of errors
+        """
+        errors = []
+        def validate_string(string):
+            return len(string.strip()) > 0
+
+        # Apply the validation function element-wise
+        validation_results = panda_sdrf.map(lambda x: validate_string(x))
+
+        # Get the indices where the validation fails
+        failed_indices = [(row, col) for row in validation_results.index for col in validation_results.columns if
+                          not validation_results.at[row, col]]
+
+        for row, col in failed_indices:
+            message = f"Empty value found Row: {row}, Column: {col}"
+            errors.append(LogicError(message, error_type=logging.ERROR))
+        return errors
+
 
 default_schema = SDRFSchema(
     [

From d88685bd0f9d1571e63c9af142ca618e2626d991 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Tue, 27 Feb 2024 16:10:01 +0000
Subject: [PATCH 06/10] version increased

---
 sdrf_pipelines/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py
index 5681085f..5a6b518d 100644
--- a/sdrf_pipelines/__init__.py
+++ b/sdrf_pipelines/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.24"
+__version__ = "0.0.25"

From 948c0c6b63e12c763614db57a0a8e35eb3e89c61 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Tue, 27 Feb 2024 16:38:59 +0000
Subject: [PATCH 07/10] only returning exact terms.

---
 sdrf_pipelines/sdrf/sdrf_schema.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index d79a599b..f8a9456a 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -68,12 +68,12 @@ def ontology_term_parser(cell_value: str = None):
 
 class SDRFColumn(Column):
     def __init__(
-        self,
-        name: str,
-        validations: typing.Iterable["_BaseValidation"] = None,
-        optional_validations: typing.Iterable["_BaseValidation"] = None,
-        allow_empty=False,
-        optional_type=True,
+            self,
+            name: str,
+            validations: typing.Iterable["_BaseValidation"] = None,
+            optional_validations: typing.Iterable["_BaseValidation"] = None,
+            allow_empty=False,
+            optional_type=True,
     ):
         if validations is None:
             validations = []
@@ -146,8 +146,8 @@ def validate(self, series: pd.Series) -> pd.Series:
 
             if ontology_terms is not None:
                 query_labels = [o["label"].lower() for o in ontology_terms]
-                for label in query_labels:
-                    labels.append(label)
+                if term[TERM_NAME] in query_labels:
+                    labels.append(term[TERM_NAME])
         if self._not_available:
             labels.append(NOT_AVAILABLE)
         if self._not_applicable:
@@ -223,9 +223,9 @@ def validate_column_names(self, panda_sdrf):
                 errors.append(cname)
             elif m.group().startswith("factor value"):
                 if (
-                    m.group().replace("factor value", "comment") not in panda_sdrf.columns
-                    and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
-                    and m.group() not in panda_sdrf.columns
+                        m.group().replace("factor value", "comment") not in panda_sdrf.columns
+                        and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
+                        and m.group() not in panda_sdrf.columns
                 ):
                     error_message = "The " + cname + " column should also be in the characteristics or comment"
                     logerror.append(LogicError(error_message, error_type=logging.ERROR))
@@ -265,7 +265,7 @@ def validate_columns_order(panda_sdrf):
                     error_message = "The column " + column + "cannot be before the assay name"
                     error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
                 if (
-                    "characteristics" in column or ("material type" in column and "factor value" not in column)
+                        "characteristics" in column or ("material type" in column and "factor value" not in column)
                 ) and cnames.index(column) > index:
                     error_message = "The column " + column + "cannot be after the assay name"
                     error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
@@ -322,6 +322,7 @@ def validate_empty_cells(self, panda_sdrf):
         :return: List of errors
         """
         errors = []
+
         def validate_string(string):
             return len(string.strip()) > 0
 

From 3649fc251ba10bc5b882095ecb35768844c02af0 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Wed, 28 Feb 2024 06:15:30 +0000
Subject: [PATCH 08/10] Remove numpy import from sdrf_schema

---
 sdrf_pipelines/sdrf/sdrf_schema.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index f8a9456a..de110181 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -2,8 +2,6 @@
 import re
 import typing
 from typing import Any
-
-import numpy as np
 import pandas as pd
 from pandas_schema import Column
 from pandas_schema import Schema

From cd8bd10c5f1f1b5f5d0da02f3b367d4c3b16549a Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Wed, 28 Feb 2024 06:41:09 +0000
Subject: [PATCH 09/10] Treat None and "nan" cells as empty; restore numpy import

---
 sdrf_pipelines/sdrf/sdrf_schema.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index de110181..a2bb22b5 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -2,6 +2,8 @@
 import re
 import typing
 from typing import Any
+
+import numpy as np
 import pandas as pd
 from pandas_schema import Column
 from pandas_schema import Schema
@@ -321,11 +323,11 @@ def validate_empty_cells(self, panda_sdrf):
         """
         errors = []
 
-        def validate_string(string):
-            return len(string.strip()) > 0
+        def validate_string(cell_value):
+            return cell_value is not None and cell_value != "nan" and len(cell_value.strip()) > 0
 
         # Apply the validation function element-wise
-        validation_results = panda_sdrf.map(lambda x: validate_string(x))
+        validation_results = panda_sdrf.map(validate_string)
 
         # Get the indices where the validation fails
         failed_indices = [(row, col) for row in validation_results.index for col in validation_results.columns if

From f5a449cb5a515a61fa170c1c002b31dc12be8e2a Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Wed, 28 Feb 2024 07:18:14 +0000
Subject: [PATCH 10/10] Remove numpy import from sdrf_schema

---
 sdrf_pipelines/sdrf/sdrf_schema.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py
index a2bb22b5..34b53717 100644
--- a/sdrf_pipelines/sdrf/sdrf_schema.py
+++ b/sdrf_pipelines/sdrf/sdrf_schema.py
@@ -3,7 +3,6 @@
 import typing
 from typing import Any
 
-import numpy as np
 import pandas as pd
 from pandas_schema import Column
 from pandas_schema import Schema