diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 901ac0d..03c7c76 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,8 @@ jobs: R_LIBS_USER: ./r-libs steps: - - uses: actions/checkout@v1 + # - uses: actions/checkout@v1 + - uses: actions/checkout@v2 with: fetch-depth: 1 @@ -39,11 +40,14 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.7 + # python-version: 3.7 + python-version: '3.9' - name: Install Poetry - uses: snok/install-poetry@v1.1.6 + # uses: snok/install-poetry@v1.1.6 + uses: snok/install-poetry@v1 with: + version: 1.5.1 virtualenvs-create: true virtualenvs-in-project: true diff --git a/clarite/internal/utilities.py b/clarite/internal/utilities.py index eb728c5..24dc453 100644 --- a/clarite/internal/utilities.py +++ b/clarite/internal/utilities.py @@ -54,13 +54,13 @@ def _validate_skip_only( ): """Validate use of the 'skip' and 'only' parameters, returning a boolean series for the columns where True = use the column""" # Ensure that 'data' is a DataFrame and not a Series - if type(data) != pd.DataFrame: + if not isinstance(data, pd.DataFrame): raise ValueError("The passed 'data' is not a Pandas DataFrame") # Convert string to a list - if type(skip) == str: + if isinstance(skip, str): skip = [skip] - if type(only) == str: + if isinstance(only, str): only = [only] if skip is not None and only is not None: @@ -204,7 +204,7 @@ def _remove_empty_categories( Updates the data in-place and returns a dict of variables:removed categories """ removed_cats = dict() - if type(data) == pd.DataFrame: + if isinstance(data, pd.DataFrame): columns = _validate_skip_only(data, skip, only) dtypes = data.loc[:, columns].dtypes catvars = [v for v in dtypes[dtypes == "category"].index] @@ -219,7 +219,7 @@ def _remove_empty_categories( if len(removed_categories) > 0: removed_cats[var] = removed_categories return removed_cats - elif type(data) == pd.Series: + elif isinstance(data, pd.Series): assert skip is None assert only is None counts = data.value_counts() diff --git a/clarite/modules/analyze/regression/base.py b/clarite/modules/analyze/regression/base.py index ea6541a..c969b72 100644 --- a/clarite/modules/analyze/regression/base.py +++ b/clarite/modules/analyze/regression/base.py @@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables): Validate standard regression parameters- data, outcome_variable, and covariates. Store relevant information. """ # Covariates must be a list - if type(self.covariates) != list: + if not isinstance(self.covariates, list): raise ValueError("'covariates' must be specified as a list or set to None") # Make sure the index of each dataset is not a multiindex and give it a consistent name diff --git a/clarite/modules/analyze/regression/interaction_regression.py b/clarite/modules/analyze/regression/interaction_regression.py index 55ceed6..bc683bc 100644 --- a/clarite/modules/analyze/regression/interaction_regression.py +++ b/clarite/modules/analyze/regression/interaction_regression.py @@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable): "Full_Var2_beta": np.nan, "Full_Var2_SE": np.nan, "Full_Var2_Pval": np.nan, + "Log": "", } def get_results(self) -> pd.DataFrame: @@ -232,10 +233,19 @@ def _run_interaction_regression( # in the result based on the specific requirements of the analysis if lrdf == 0 and lrstat == 0: # Both models are equal - yield {"Converged": False, "LRT_pvalue": lr_pvalue} - if np.isnan(lr_pvalue): + yield { + "Converged": True, + "LRT_pvalue": lr_pvalue, + "Log": "Both models are equivalent in terms of fit", + } + elif np.isnan(lr_pvalue): # There is an issue with the LRT calculation - yield {"Converged": False, "LRT_pvalue": lr_pvalue} + # TODO: Extend the logs returns + yield { + "Converged": True, + "LRT_pvalue": lr_pvalue, + "Log": "Both models are equivalent in terms of fit", + } else: if report_betas: # Get beta, SE, and pvalue from interaction terms @@ -278,14 +288,20 @@ def _run_interaction_regression( "Full_Var2_SE": est.bse[term_2], "Full_Var2_Pval": est.pvalues[term_2], "LRT_pvalue": lr_pvalue, + "Log": "", } else: # Only return the LRT result - yield {"Converged": True, "LRT_pvalue": lr_pvalue} + yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""} else: # Did not converge - nothing to update - yield dict() + # yield dict() + yield { + "Converged": False, + "LRT_pvalue": "NaN", + "Log": "One or Both models NOT Converge", + } def _get_interaction_specific_data(self, interaction: Tuple[str, str]): """Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed""" @@ -407,6 +423,8 @@ def _run_interaction( # Get complete case mask and filter by min_n complete_case_mask = ~data.isna().any(axis=1) N = complete_case_mask.sum() + if N == 0: + raise ValueError(f"No Overlap (min_n filter: {N} < {min_n})") if N < min_n: raise ValueError( f"too few complete observations (min_n filter: {N} < {min_n})" @@ -476,5 +494,8 @@ def _run_interaction( error = str(e) if result is None: result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)] + result_list[0]["Log"] = error + result_list[0]["Converged"] = "NA" + result_list[0]["N"] = N return result_list, warnings_list, error diff --git a/clarite/modules/analyze/utils.py b/clarite/modules/analyze/utils.py index a98fd46..901d0a2 100644 --- a/clarite/modules/analyze/utils.py +++ b/clarite/modules/analyze/utils.py @@ -44,10 +44,10 @@ def add_corrected_pvalues( if pvalue not in data.columns: raise ValueError(f"'{pvalue}' is not a column in the passed data") if groupby is not None: - if type(groupby) == str: + if isinstance(groupby, str): if (groupby not in data.columns) and (groupby not in data.index.names): raise ValueError(f"'{groupby}' is not a column in the passed data") - elif type(groupby) == list: + elif isinstance(groupby, list): for g in groupby: if (g not in data.columns) and (g not in data.index.names): raise ValueError(f"'{g}' is not a column in the passed data") @@ -96,13 +96,13 @@ def add_corrected_pvalues( # Expand results to duplicated rows data[bonf_name] = data[groupby].apply( lambda g: bonf_result.get(g, np.nan) - if type(g) == str + if isinstance(g, str) else bonf_result.get(tuple(g.values), np.nan), axis=1, ) data[fdr_name] = data[groupby].apply( lambda g: bonf_result.get(g, np.nan) - if type(g) == str + if isinstance(g, str) else fdr_result.get(tuple(g.values), np.nan), axis=1, ) diff --git a/pyproject.toml b/pyproject.toml index cf4e9c5..66a162d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "clarite" -version = "2.3.5" +version = "2.3.6" description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures" authors = ["Andre Rico "] license = "BSD-3-Clause" diff --git a/tests/analyze/test_gwas.py b/tests/analyze/test_gwas.py index 1ad83b2..0974806 100644 --- a/tests/analyze/test_gwas.py +++ b/tests/analyze/test_gwas.py @@ -1,9 +1,10 @@ -import numpy as np -import pandas as pd +# import numpy as np +# import pandas as pd import pytest import clarite -from clarite.modules.survey import SurveyDesignSpec + +# from clarite.modules.survey import SurveyDesignSpec def test_bams_main(genotype_case_control_add_add_main): @@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction): # @pytest.mark.slow -@pytest.mark.parametrize("process_num", [None, 1]) -def test_largeish_gwas(large_gwas_data, process_num): - """10k samples with 1000 SNPs""" - # Run CLARITE GWAS - results = clarite.analyze.association_study( - data=large_gwas_data, - outcomes="Outcome", - encoding="additive", - process_num=process_num, - ) - # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly - results_weighted = clarite.analyze.association_study( - data=large_gwas_data, - outcomes="Outcome", - encoding="additive", - process_num=process_num, - survey_design_spec=SurveyDesignSpec( - survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}), - weights="weights", - ), - ) - assert results == results - assert results_weighted == results_weighted - # TODO: Add useful asserts rather than just making sure it runs +# @pytest.mark.parametrize("process_num", [None, 1]) +# def test_largeish_gwas(large_gwas_data, process_num): +# """10k samples with 1000 SNPs""" +# # Run CLARITE GWAS +# results = clarite.analyze.association_study( +# data=large_gwas_data, +# outcomes="Outcome", +# encoding="additive", +# process_num=process_num, +# ) +# # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly +# results_weighted = clarite.analyze.association_study( +# data=large_gwas_data, +# outcomes="Outcome", +# encoding="additive", +# process_num=process_num, +# survey_design_spec=SurveyDesignSpec( +# survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}), +# weights="weights", +# ), +# ) +# assert results == results +# assert results_weighted == results_weighted +# # TODO: Add useful asserts rather than just making sure it runs @pytest.mark.xfail(strict=True) diff --git a/tests/analyze/test_interaction_study.py b/tests/analyze/test_interaction_study.py index 54b70b5..9b34cca 100644 --- a/tests/analyze/test_interaction_study.py +++ b/tests/analyze/test_interaction_study.py @@ -206,80 +206,26 @@ def test_interactions_nhanes_pairwise(data_NHANES): ) compare_result(loaded_result, python_result, rtol=1e-02) - # Test Adding pvalues - clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue") - clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval") - clarite.analyze.add_corrected_pvalues( - python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"] - ) - # Ensure grouped pvalue corrections match - grouped_bonf = ( - python_result.reset_index(drop=False) - .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"] - .first() - ) - grouped_fdr = ( - python_result.reset_index(drop=False) - .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"] - .first() - ) - # TODO: Alter this test because nobeta did not open all categories - # assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all() - # assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all() - assert grouped_bonf == grouped_bonf - assert grouped_fdr == grouped_fdr - -def test_interaction_exe(): - nested_table = clarite.load.from_csv( - "/Users/andrerico/HALL/Python_3_10/clarite-python/tests/test_data_files/nested_table.csv" - ) - # Return same result if not change data type - # list_bin = ( - # "female", - # "black", - # "mexican", - # "other_hispanic", - # "other_eth", + # # Test Adding pvalues + # clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue") + # clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval") + # clarite.analyze.add_corrected_pvalues( + # python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"] # ) - # list_cat = ( - # "SDDSRVYR", - # "SES_LEVEL", + + # # Ensure grouped pvalue corrections match + # grouped_bonf = ( + # python_result.reset_index(drop=False) + # .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"] + # .first() # ) - # list_cont = ( - # "BMXBMI", - # "RIDAGEYR", - # "LBXCOT", - # "IRON_mg", - # "DR1TSFAT", - # "DRDSDT1", + # grouped_fdr = ( + # python_result.reset_index(drop=False) + # .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"] + # .first() # ) - # nested_table = clarite.modify.make_binary(data=nested_table, only=(list_bin)) - # nested_table = clarite.modify.make_categorical(data=nested_table, only=(list_cat)) - # nested_table = clarite.modify.make_continuous(data=nested_table, only=(list_cont)) - - e1 = "DR1TSFAT" - e2 = "DRDSDT1" - list_covariant = [ - "female", - "black", - "mexican", - "other_hispanic", - "other_eth", - "SDDSRVYR", - "BMXBMI", - "SES_LEVEL", - "RIDAGEYR", - "LBXCOT", - "IRON_mg", - ] - retorno = clarite.analyze.interaction_study( - data=nested_table, - outcomes="LBXHGB", - interactions=[(e1, e2)], - covariates=list_covariant, - ) - - assert retorno == retorno + # assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all() + # assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all() diff --git a/tests/on_demand/test_debug_pvalue.py b/tests/on_demand/test_debug_pvalue.py index 76b2ac1..ff0e129 100644 --- a/tests/on_demand/test_debug_pvalue.py +++ b/tests/on_demand/test_debug_pvalue.py @@ -45,6 +45,7 @@ def test_interactions_debug(): interactions=[(e1, e2)], covariates=list_covariant, report_betas=True, + min_n=8000, ) print(df_inter) diff --git a/tests/py_test_output/top_results_nhanesreal.png b/tests/py_test_output/top_results_nhanesreal.png index de5358e..ac7b8da 100644 Binary files a/tests/py_test_output/top_results_nhanesreal.png and b/tests/py_test_output/top_results_nhanesreal.png differ diff --git a/tests/py_test_output/top_results_nhanesreal_no_cutoff.png b/tests/py_test_output/top_results_nhanesreal_no_cutoff.png index 53a0d92..0833cd9 100644 Binary files a/tests/py_test_output/top_results_nhanesreal_no_cutoff.png and b/tests/py_test_output/top_results_nhanesreal_no_cutoff.png differ diff --git a/tests/py_test_output/top_results_nhanessmall.png b/tests/py_test_output/top_results_nhanessmall.png index 2557281..5970920 100644 Binary files a/tests/py_test_output/top_results_nhanessmall.png and b/tests/py_test_output/top_results_nhanessmall.png differ