From 90bab1074cb0e321241a4067719607d05252bbb0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 8 Jan 2024 18:54:16 +0000 Subject: [PATCH 01/10] Update ci.yml --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 090334d0..ab2afc52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,9 +5,9 @@ name: Continuous integration Unit tests on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: PythonBlack: From 958df91b5edf7f2eab1e5175caf0d613f26105c9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 8 Jan 2024 18:54:32 +0000 Subject: [PATCH 02/10] Update pythonapp.yml --- .github/workflows/pythonapp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 38c78498..bd87c20d 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -5,9 +5,9 @@ name: Python application on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: build: From b20df9d870fc27f95c18692a0ed88c0610255f09 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 8 Jan 2024 18:54:46 +0000 Subject: [PATCH 03/10] Update pythonpackage.yml --- .github/workflows/pythonpackage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index d32ecdb7..5fb3c658 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,9 +5,9 @@ name: Python package on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: build: From 2149cf460232f8c0faad64d5113ef778c0c4d227 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 27 Feb 2024 14:47:40 +0000 Subject: [PATCH 04/10] implement recursive in search OLS --- sdrf_pipelines/sdrf/sdrf_schema.py | 2 + sdrf_pipelines/zooma/ols.py | 77 ++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index 2c3cf8bf..69e37d42 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -135,6 +135,8 @@ def validate(self, series: pd.Series) -> pd.Series: terms = [ontology_term_parser(x) for x in series.unique()] labels = [] for term in terms: + if term['NT'] == 'clostridium perfringens': + print(term) if TERM_NAME not in term: ontology_terms = None else: diff --git a/sdrf_pipelines/zooma/ols.py b/sdrf_pipelines/zooma/ols.py index c06084ab..ab9a80cb 100644 --- a/sdrf_pipelines/zooma/ols.py +++ b/sdrf_pipelines/zooma/ols.py @@ -105,14 +105,17 @@ def get_ancestors(self, ont, iri): raise ex def search( - self, - name, - query_fields=None, - ontology=None, - field_list=None, - children_of=None, - exact=None, - bytype="class", + self, + name: str, + query_fields=None, + ontology: str=None, + field_list=None, + children_of=None, + exact: bool=None, + bytype: str="class", + rows: int=10, + num_retries:int=10, + start: int=0, ): """ Searches the OLS with the given term @@ -124,6 +127,8 @@ def search( @:param exact: Forces exact match if not `None` @:param bytype: restrict to terms one of {class,property,individual,ontology} @:param childrenOf: Search only under a certain term. + @:param rows: number of rows to query on each call of OLS search + @:param num_retries: Number of retries to OLS when it fails. """ params = {"q": name} if ontology is not None: @@ -135,6 +140,9 @@ def search( if bytype: params["type"] = _concat_str_or_list(bytype) + if rows: + params["rows"] = rows + if ontology: params["ontology"] = _concat_str_or_list(ontology) elif self.ontology: @@ -155,26 +163,53 @@ def search( if len(children_of) > 0: params["childrenOf"] = _concat_str_or_list(children_of) - retry_num = 0 + if start: + params["start"] = start + + docs_found = [] - while retry_num < 10: + for retry_num in range(num_retries): try: req = self.session.get(self.ontology_search, params=params) - logger.debug("Request to OLS search API: %s - %s", req.status_code, name) + logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code) - req.raise_for_status() - if req.json()["response"]["numFound"]: - return req.json()["response"]["docs"] - if exact: - logger.debug("OLS exact search returned empty response for %s", name) + if req.status_code != 200: + logger.error("OLS search term %s error tried number %s", name, retry_num) + req.raise_for_status() else: - logger.debug("OLS search returned empty response for %s", name) - return None + if req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return docs_found + elif len(req.json()["response"]["docs"]) < rows: + return req.json()["response"]["docs"] + else: + docs_found = req.json()["response"]["docs"] + docs_found.extend(self.search(name, query_fields=query_fields, ontology=ontology, + field_list=field_list, children_of=children_of, exact=exact, + bytype=bytype, rows=rows, num_retries=num_retries, + start=(rows + (start)))) + return docs_found + + if req.status_code == 200 and req.json()["response"]["numFound"] == 0: + if exact: + logger.debug("OLS exact search returned empty response for %s", name) + else: + logger.debug("OLS search returned empty response for %s", name) + return None + elif req.status_code != 200 and req.json()["response"]["numFound"] > 0: + if len(req.json()["response"]["docs"]) <= rows: + return req.json()["response"]["docs"] + else: + start = 0 + docs_found = req.json()["response"]["docs"] + except Exception as ex: - retry_num += 1 - logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex) + logger.exception("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex) - return None + return docs_found def suggest(self, name, ontology=None): """Suggest terms from an optional list of ontologies From 2fd27a77856ce30f7de087cb05e3be4e4340dfd4 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 27 Feb 2024 16:05:12 +0000 Subject: [PATCH 05/10] check empty cells in sdrf --- sdrf_pipelines/sdrf/sdrf_schema.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index 69e37d42..d79a599b 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -3,6 +3,7 @@ import typing from typing import Any +import numpy as np import pandas as pd from pandas_schema import Column from pandas_schema import Schema @@ -135,8 +136,6 @@ def validate(self, series: pd.Series) -> pd.Series: terms = [ontology_term_parser(x) for x in series.unique()] labels = [] for term in terms: - if term['NT'] == 'clostridium perfringens': - print(term) if TERM_NAME not in term: ontology_terms = None else: @@ -181,6 +180,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]: ) errors.append(LogicError(error_message, error_type=logging.WARN)) + empty_cells_errors = self.validate_empty_cells(panda_sdrf) + if empty_cells_errors: + errors.extend(empty_cells_errors) + # Check the mandatory fields error_mandatory = self.validate_mandatory_columns(panda_sdrf) if error_mandatory is not None: @@ -312,6 +315,28 @@ def check_recommendations(self, panda_sdrf): warnings += column.validate_optional(series) return sorted(warnings, key=lambda e: e.row) + def validate_empty_cells(self, panda_sdrf): + """ + Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found. + :param panda_sdrf: SDRF dataframe + :return: List of errors + """ + errors = [] + def validate_string(string): + return len(string.strip()) > 0 + + # Apply the validation function element-wise + validation_results = panda_sdrf.map(lambda x: validate_string(x)) + + # Get the indices where the validation fails + failed_indices = [(row, col) for row in validation_results.index for col in validation_results.columns if + not validation_results.at[row, col]] + + for row, col in failed_indices: + message = f"Empty value found Row: {row}, Column: {col}" + errors.append(LogicError(message, error_type=logging.ERROR)) + return errors + default_schema = SDRFSchema( [ From d88685bd0f9d1571e63c9af142ca618e2626d991 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 27 Feb 2024 16:10:01 +0000 Subject: [PATCH 06/10] version increased --- sdrf_pipelines/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdrf_pipelines/__init__.py b/sdrf_pipelines/__init__.py index 5681085f..5a6b518d 100644 --- a/sdrf_pipelines/__init__.py +++ b/sdrf_pipelines/__init__.py @@ -1 +1 @@ -__version__ = "0.0.24" +__version__ = "0.0.25" From 948c0c6b63e12c763614db57a0a8e35eb3e89c61 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 27 Feb 2024 16:38:59 +0000 Subject: [PATCH 07/10] only returning exact terms. --- sdrf_pipelines/sdrf/sdrf_schema.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index d79a599b..f8a9456a 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -68,12 +68,12 @@ def ontology_term_parser(cell_value: str = None): class SDRFColumn(Column): def __init__( - self, - name: str, - validations: typing.Iterable["_BaseValidation"] = None, - optional_validations: typing.Iterable["_BaseValidation"] = None, - allow_empty=False, - optional_type=True, + self, + name: str, + validations: typing.Iterable["_BaseValidation"] = None, + optional_validations: typing.Iterable["_BaseValidation"] = None, + allow_empty=False, + optional_type=True, ): if validations is None: validations = [] @@ -146,8 +146,8 @@ def validate(self, series: pd.Series) -> pd.Series: if ontology_terms is not None: query_labels = [o["label"].lower() for o in ontology_terms] - for label in query_labels: - labels.append(label) + if term[TERM_NAME] in query_labels: + labels.append(term[TERM_NAME]) if self._not_available: labels.append(NOT_AVAILABLE) if self._not_applicable: @@ -223,9 +223,9 @@ def validate_column_names(self, panda_sdrf): errors.append(cname) elif m.group().startswith("factor value"): if ( - m.group().replace("factor value", "comment") not in panda_sdrf.columns - and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns - and m.group() not in panda_sdrf.columns + m.group().replace("factor value", "comment") not in panda_sdrf.columns + and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns + and m.group() not in panda_sdrf.columns ): error_message = "The " + cname + " column should also be in the characteristics or comment" logerror.append(LogicError(error_message, error_type=logging.ERROR)) @@ -265,7 +265,7 @@ def validate_columns_order(panda_sdrf): error_message = "The column " + column + "cannot be before the assay name" error_columns_order.append(LogicError(error_message, error_type=logging.ERROR)) if ( - "characteristics" in column or ("material type" in column and "factor value" not in column) + "characteristics" in column or ("material type" in column and "factor value" not in column) ) and cnames.index(column) > index: error_message = "The column " + column + "cannot be after the assay name" error_columns_order.append(LogicError(error_message, error_type=logging.ERROR)) @@ -322,6 +322,7 @@ def validate_empty_cells(self, panda_sdrf): :return: List of errors """ errors = [] + def validate_string(string): return len(string.strip()) > 0 From 3649fc251ba10bc5b882095ecb35768844c02af0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 28 Feb 2024 06:15:30 +0000 Subject: [PATCH 08/10] only returning exact terms. --- sdrf_pipelines/sdrf/sdrf_schema.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index f8a9456a..de110181 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -2,8 +2,6 @@ import re import typing from typing import Any - -import numpy as np import pandas as pd from pandas_schema import Column from pandas_schema import Schema From cd8bd10c5f1f1b5f5d0da02f3b367d4c3b16549a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 28 Feb 2024 06:41:09 +0000 Subject: [PATCH 09/10] only returning exact terms. --- sdrf_pipelines/sdrf/sdrf_schema.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index de110181..a2bb22b5 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -2,6 +2,8 @@ import re import typing from typing import Any + +import numpy as np import pandas as pd from pandas_schema import Column from pandas_schema import Schema @@ -321,11 +323,11 @@ def validate_empty_cells(self, panda_sdrf): """ errors = [] - def validate_string(string): - return len(string.strip()) > 0 + def validate_string(cell_value): + return cell_value is not None and cell_value != "nan" and len(cell_value.strip()) > 0 # Apply the validation function element-wise - validation_results = panda_sdrf.map(lambda x: validate_string(x)) + validation_results = panda_sdrf.map(validate_string) # Get the indices where the validation fails failed_indices = [(row, col) for row in validation_results.index for col in validation_results.columns if From f5a449cb5a515a61fa170c1c002b31dc12be8e2a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 28 Feb 2024 07:18:14 +0000 Subject: [PATCH 10/10] only returning exact terms. --- sdrf_pipelines/sdrf/sdrf_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdrf_pipelines/sdrf/sdrf_schema.py b/sdrf_pipelines/sdrf/sdrf_schema.py index a2bb22b5..34b53717 100644 --- a/sdrf_pipelines/sdrf/sdrf_schema.py +++ b/sdrf_pipelines/sdrf/sdrf_schema.py @@ -3,7 +3,6 @@ import typing from typing import Any -import numpy as np import pandas as pd from pandas_schema import Column from pandas_schema import Schema