From 78fbcea72b9dcc8fb047b50e26fde656aae2fbe2 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 14:41:42 -0400 Subject: [PATCH 01/29] add continous integration --- .github/workflows/ci.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..52f1904 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: continuous-integration + +on: [push] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] # remove mac tests + # Latest pyOpenMS supports Python 3.10, and 3.11 + python-version: ["3.10", "3.11"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + pip install . + + - name: Test + run: | + python -m pytest tests/ From b34300b03e7b85a3baa1fa05376fa460c45afd6e Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 14:44:20 -0400 Subject: [PATCH 02/29] preinstall numpy --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52f1904..db704f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pytest + pip install numpy pip install . 
- name: Test From 955e1f13e095b8325d5e4183cb7a4a1024660830 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 14:53:07 -0400 Subject: [PATCH 03/29] remove numpy from setup --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d7e7621..ac191ee 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ import sys -import numpy +#import numpy from setuptools import setup, find_packages from distutils.extension import Extension @@ -36,7 +36,6 @@ url="https://github.com/PyProphet/pyprophet", packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), include_package_data=True, - include_dirs=[numpy.get_include()], classifiers=[ 'Development Status :: 3 - Alpha', 'Environment :: Console', From a6841366bc53cc588dddca8aeb0dbde36419c416 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 14:56:45 -0400 Subject: [PATCH 04/29] install numpy in setup script --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac191ee..faadc30 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,15 @@ import sys -#import numpy from setuptools import setup, find_packages from distutils.extension import Extension +try: + import numpy +except ImportError: + print("Numpy is not installed. 
Installing it now.") + import subprocess + subprocess.check_call(['pip', 'install', 'numpy']) + import numpy + try: from Cython.Build import cythonize except ImportError: @@ -36,6 +43,7 @@ url="https://github.com/PyProphet/pyprophet", packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), include_package_data=True, + include_dirs=[numpy.get_include()], classifiers=[ 'Development Status :: 3 - Alpha', 'Environment :: Console', From ff5804c0effb5cf7da838926faa54a2289b34a6f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 15:04:39 -0400 Subject: [PATCH 05/29] convert to .toml setup convert to .toml setup for building on github --- pyproject.toml | 50 ++++++++++++++++++++++++++++ setup.py | 89 +++++++++----------------------------------------- 2 files changed, 66 insertions(+), 73 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4a665c0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["setuptools", "wheel", "numpy", "cython"] # Dependencies needed to build the package +build-backend = "setuptools.build_meta" + +[project] +name = "pyprophet" +version = "2.2.8" +description = "PyProphet: Semi-supervised learning and scoring of OpenSWATH results." 
+readme = { file = "README.md", content-type = "text/markdown" } +license = { text = "BSD" } +authors = [{ name = "The PyProphet Developers", email = "rocksportrocker@gmail.com" }] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Chemistry" +] +keywords = ["bioinformatics", "openSWATH", "mass spectrometry"] + +# Dependencies required for runtime +dependencies = [ + "Click", + "duckdb", + "numpy >= 1.9.0", + "scipy", + "pandas >= 0.17", + "cython", + "numexpr >= 2.10.1", + "scikit-learn >= 0.17", + "xgboost", + "hyperopt", + "statsmodels >= 0.8.0", + "matplotlib", + "tabulate", + "pyarrow", + "pypdf" +] + +# Define console entry points +[project.scripts] +pyprophet = "pyprophet.main:cli" + +[tool.setuptools] +packages = ["pyprophet"] +include-package-data = true +zip-safe = false + diff --git a/setup.py b/setup.py index faadc30..489ea94 100644 --- a/setup.py +++ b/setup.py @@ -1,80 +1,23 @@ import sys -from setuptools import setup, find_packages -from distutils.extension import Extension +from setuptools import setup, Extension +from Cython.Build import cythonize +import numpy -try: - import numpy -except ImportError: - print("Numpy is not installed. 
Installing it now.") - import subprocess - subprocess.check_call(['pip', 'install', 'numpy']) - import numpy +use_cython = True +ext = ".pyx" if use_cython else ".c" -try: - from Cython.Build import cythonize -except ImportError: - use_cython = False -else: - use_cython = True - -cmdclass = {} -ext_modules = [] +extensions = [ + Extension( + "pyprophet._optimized", + [f"pyprophet/_optimized{ext}"], + include_dirs=[numpy.get_include()], + ) +] if use_cython: - ext_modules += [Extension("pyprophet._optimized", ["pyprophet/_optimized.pyx"])] - ext_modules = cythonize(ext_modules) -else: - ext_modules += [Extension("pyprophet._optimized", ["pyprophet/_optimized.c"])] + extensions = cythonize(extensions) -# read the contents of README for PyPI -from os import path -this_directory = path.abspath(path.dirname(__file__)) -with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: - long_description = f.read() +setup( + ext_modules=extensions, +) -setup(name='pyprophet', - version="2.2.8", - author="The PyProphet Developers", - author_email="rocksportrocker@gmail.com", - description="PyProphet: Semi-supervised learning and scoring of OpenSWATH results.", - long_description=long_description, - long_description_content_type='text/markdown', - license="BSD", - url="https://github.com/PyProphet/pyprophet", - packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), - include_package_data=True, - include_dirs=[numpy.get_include()], - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Environment :: Console', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Chemistry', - ], - zip_safe=False, - install_requires=[ - "Click", - "duckdb", - "numpy >= 1.9.0", - "scipy", - "pandas >= 0.17", - "cython", - "numexpr >= 2.10.1", - "scikit-learn >= 0.17", - "xgboost", - "hyperopt", - 
"statsmodels >= 0.8.0", - "matplotlib", - "tabulate", - "pyarrow", - "pypdf" - ], - entry_points={ - 'console_scripts': [ - "pyprophet=pyprophet.main:cli", - ] - }, - ext_modules=ext_modules, - ) From 4b6750ad33585b5c3d8bcca2bf9db2e52cc21244 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 15:12:48 -0400 Subject: [PATCH 06/29] remove numpy from requirement --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db704f8..52f1904 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,6 @@ jobs: run: | python -m pip install --upgrade pip pip install pytest - pip install numpy pip install . - name: Test From 122753d3afbd2c0e2c182d0c16538ebaf1463cc0 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 15:13:56 -0400 Subject: [PATCH 07/29] just ubuntu for now --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52f1904..a64618f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] # remove mac tests + os: [ubuntu-latest] + #os: [ubuntu-latest, windows-latest, macos-latest] # remove mac tests # Latest pyOpenMS supports Python 3.10, and 3.11 python-version: ["3.10", "3.11"] steps: From b9f35afb7c064083804bca8fc86bb84b2ff945a4 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 16:31:15 -0400 Subject: [PATCH 08/29] fix setup.py and .toml both are required because of cython, works locally now --- pyproject.toml | 5 ++--- setup.py | 29 +++++++++++++---------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a665c0..173a84b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,6 @@ dependencies = [ pyprophet = 
"pyprophet.main:cli" [tool.setuptools] -packages = ["pyprophet"] +packages = { find = { exclude = ["ez_setup", "examples", "tests"] } } include-package-data = true -zip-safe = false - +zip-safe = false \ No newline at end of file diff --git a/setup.py b/setup.py index 489ea94..6dd7624 100644 --- a/setup.py +++ b/setup.py @@ -1,23 +1,20 @@ -import sys -from setuptools import setup, Extension +from setuptools import setup, Extension, find_packages from Cython.Build import cythonize import numpy -use_cython = True -ext = ".pyx" if use_cython else ".c" - -extensions = [ - Extension( - "pyprophet._optimized", - [f"pyprophet/_optimized{ext}"], - include_dirs=[numpy.get_include()], - ) -] +try: + from Cython.Build import cythonize +except ImportError: + use_cython = False +else: + use_cython = True +ext_modules = [] if use_cython: - extensions = cythonize(extensions) + ext_modules += [Extension("pyprophet._optimized", ["pyprophet/_optimized.pyx"])] + ext_modules = cythonize(ext_modules) +else: + ext_modules += [Extension("pyprophet._optimized", ["pyprophet/_optimized.c"])] -setup( - ext_modules=extensions, -) +setup(name='pyprophet', ext_modules=ext_modules, include_dirs=[numpy.get_include()]) From be62dfb461a6f4b763ea60b02e4c52a15a2b80ec Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Oct 2024 16:42:07 -0400 Subject: [PATCH 09/29] add line to build extension --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a64618f..5f0f093 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,9 @@ jobs: pip install pytest pip install . 
+ - name: Compile cython module + run: python setup.py build_ext --inplace + - name: Test run: | python -m pytest tests/ From 15dd65babb4e1b1b8f1da7bd7dffb1469d6bdc57 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 31 Oct 2024 16:58:30 -0400 Subject: [PATCH 10/29] fix: stats tests --- pyprophet/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyprophet/stats.py b/pyprophet/stats.py index 6934e87..349ef7f 100644 --- a/pyprophet/stats.py +++ b/pyprophet/stats.py @@ -233,7 +233,7 @@ def pi0est(p_values, lambda_ = np.arange(0.05,1.0,0.05), pi0_method = "smoother" @profile def qvalue(p_values, pi0, pfdr = False): - p = np.array(p_values) + p = np.array(p_values).copy() qvals_out = p rm_na = np.isfinite(p) @@ -277,7 +277,7 @@ def bw_nrd0(x): @profile def lfdr(p_values, pi0, trunc = True, monotone = True, transf = "probit", adj = 1.5, eps = np.power(10.0,-8)): """ Estimate local FDR / posterior error probability from p-values according to bioconductor/qvalue """ - p = np.array(p_values) + p = np.array(p_values).copy() # Compare to bioconductor/qvalue reference implementation # import rpy2 From c99db26a06be12917b0a49343ced8eada1895db1 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 31 Oct 2024 17:27:14 -0400 Subject: [PATCH 11/29] add pytest-regtest to workflow --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f0f093..b33d29d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pytest + pip install pytest-regtest pip install . - name: Compile cython module From fdb5513818b4280d8478e9720dc1c26257cb2ee5 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 1 Nov 2024 10:42:09 -0400 Subject: [PATCH 12/29] update autotunning so does not fail note tests still fail though, possibly because random seed is different? 
--- pyprophet/classifiers.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pyprophet/classifiers.py b/pyprophet/classifiers.py index 750bc82..0aefd04 100644 --- a/pyprophet/classifiers.py +++ b/pyprophet/classifiers.py @@ -110,7 +110,8 @@ def objective(params): clf = xgb.XGBClassifier(random_state=42, verbosity=0, objective='binary:logitraw', eval_metric='auc', **params) - score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(42))).mean() + rng = np.random.default_rng(42) + score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=42)).mean() # click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params)) return score @@ -129,7 +130,8 @@ def objective(params): xgb_params_complexity = self.xgb_params_tuned xgb_params_complexity.update({k: self.xgb_params_space[k] for k in ('max_depth', 'min_child_weight')}) - best_complexity = fmin(fn=objective, space=xgb_params_complexity, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42)) + rng = np.random.default_rng(42) + best_complexity = fmin(fn=objective, space=xgb_params_complexity, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng) best_complexity['max_depth'] = int(best_complexity['max_depth']) best_complexity['min_child_weight'] = int(best_complexity['min_child_weight']) @@ -139,7 +141,7 @@ def objective(params): xgb_params_gamma = self.xgb_params_tuned xgb_params_gamma['gamma'] = self.xgb_params_space['gamma'] - best_gamma = fmin(fn=objective, space=xgb_params_gamma, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42)) + best_gamma = fmin(fn=objective, space=xgb_params_gamma, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng) 
self.xgb_params_tuned.update(best_gamma) @@ -147,7 +149,7 @@ def objective(params): xgb_params_subsampling = self.xgb_params_tuned xgb_params_subsampling.update({k: self.xgb_params_space[k] for k in ('subsample', 'colsample_bytree', 'colsample_bylevel', 'colsample_bynode')}) - best_subsampling = fmin(fn=objective, space=xgb_params_subsampling, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42)) + best_subsampling = fmin(fn=objective, space=xgb_params_subsampling, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng) self.xgb_params_tuned.update(best_subsampling) @@ -155,7 +157,7 @@ def objective(params): xgb_params_regularization = self.xgb_params_tuned xgb_params_regularization.update({k: self.xgb_params_space[k] for k in ('lambda', 'alpha')}) - best_regularization = fmin(fn=objective, space=xgb_params_regularization, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42)) + best_regularization = fmin(fn=objective, space=xgb_params_regularization, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng) self.xgb_params_tuned.update(best_regularization) @@ -163,7 +165,7 @@ def objective(params): xgb_params_learning = self.xgb_params_tuned xgb_params_learning['eta'] = self.xgb_params_space['eta'] - best_learning = fmin(fn=objective, space=xgb_params_learning, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42)) + best_learning = fmin(fn=objective, space=xgb_params_learning, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng) self.xgb_params_tuned.update(best_learning) click.echo("Info: Optimal hyperparameters: {}".format(self.xgb_params_tuned)) From 3c8bfbdcbf85bd30a3d6310ecd66512929a875ea Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 1 Nov 2024 12:47:14 -0400 Subject: [PATCH 13/29] fix: fix level context 
tests --- pyprophet/levels_contexts.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyprophet/levels_contexts.py b/pyprophet/levels_contexts.py index c7ce194..edc3602 100644 --- a/pyprophet/levels_contexts.py +++ b/pyprophet/levels_contexts.py @@ -33,7 +33,12 @@ def statistics_report(data, outfile, context, analyte, parametric, pfdr, pi0_lam outfile = outfile + "_" + str(data['run_id'].unique()[0]) # export PDF report - save_report(outfile + "_" + context + "_" + analyte + ".pdf", outfile + ": " + context + " " + analyte + "-level error-rate control", data[data.decoy==1]["score"], data[data.decoy==0]["score"], stat_table["cutoff"], stat_table["svalue"], stat_table["qvalue"], data[data.decoy==0]["p_value"], pi0, color_palette) + save_report(outfile + "_" + context + "_" + analyte + ".pdf", + outfile + ": " + context + " " + analyte + "-level error-rate control", + data[data.decoy==1]["score"].values, data[data.decoy==0]["score"].values, stat_table["cutoff"].values, + stat_table["svalue"].values, stat_table["qvalue"].values, data[data.decoy==0]["p_value"].values, + pi0, + color_palette) return(data) @@ -184,7 +189,7 @@ def infer_proteins(infile, outfile, context, parametric, pfdr, pi0_lambda, pi0_m con.close() if context == 'run-specific': - data = data.groupby('run_id').apply(statistics_report, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette).reset_index() + data = data.groupby('run_id').apply(statistics_report, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette) elif context in ['global', 'experiment-wide']: data = statistics_report(data, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, 
lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette) @@ -257,7 +262,7 @@ def infer_peptides(infile, outfile, context, parametric, pfdr, pi0_lambda, pi0_m con.close() if context == 'run-specific': - data = data.groupby('run_id').apply(statistics_report, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette).reset_index() + data = data.groupby('run_id').apply(statistics_report, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette) elif context in ['global', 'experiment-wide']: data = statistics_report(data, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette) From d320945efd572b562965f5a605173aaf9cba76dc Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 1 Nov 2024 13:08:44 -0400 Subject: [PATCH 14/29] update export-parquet tests and fix tests update the tests to remove old parameters and fix tests that were failing --- pyprophet/export_parquet.py | 2 +- tests/test_pyprophet_export_parquet.py | 21 +++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index fe0d3ed..795a12a 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ -172,7 +172,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): # transition level if transitionLevel: - columns['FEATURE_TRANSITION'] = ['AREA_INTENSITY', 'TOTAL_AREA_INTENSITY', 'APEX_INTENSITY', 'TOTAL_MI'] + getVarColumnNames(condb, 'FEATURE_TRANSITION') + columns['FEATURE_TRANSITION'] = ['AREA_INTENSITY', 'TOTAL_AREA_INTENSITY', 'APEX_INTENSITY', 
'TOTAL_MI'] + getVarColumnNames(con, 'FEATURE_TRANSITION') columns['TRANSITION'] = ['TRAML_ID', 'PRODUCT_MZ', 'CHARGE', 'TYPE', 'ORDINAL', 'DETECTING', 'IDENTIFYING', 'QUANTIFYING', 'LIBRARY_INTENSITY'] columns['TRANSITION_PRECURSOR_MAPPING'] = ['TRANSITION_ID'] diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py index b4f3dba..6c01696 100644 --- a/tests/test_pyprophet_export_parquet.py +++ b/tests/test_pyprophet_export_parquet.py @@ -27,14 +27,14 @@ def _run_cmdline(cmdline): return stdout -def _run_export_parquet_single_run(temp_folder, transitionLevel=False, threads=1, chunksize=1000, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False): +def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False): os.chdir(temp_folder) DATA_NAME="dummyOSWScoredData.osw" data_path = os.path.join(DATA_FOLDER, DATA_NAME) conn = sqlite3.connect(DATA_NAME) shutil.copy(data_path, temp_folder) - cmdline = "pyprophet export-parquet --in={} --threads={} --chunksize={}".format(DATA_NAME, threads, chunksize) + cmdline = "pyprophet export-parquet --in={}".format(DATA_NAME) # if testing transition level add --transitionLevel flag if transitionLevel: @@ -112,19 +112,4 @@ def test_export_parquet_single_run_onlyFeatures(tmpdir): def test_export_parquet_single_run_transitionLevel_onlyFeatures(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) - - -def test_multithread_export_parquet_single_run(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=False, threads=2, chunksize=2) - -def test_multithread_export_parquet_single_run_transitionLevel(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=True, threads=2, chunksize=2) - - -def test_multithread_export_parquet_single_run_onlyFeatures(tmpdir): - _run_export_parquet_single_run(tmpdir, onlyFeatures=True, threads=2, 
chunksize=4) - - -def test_multithread_export_parquet_single_run_transitionLevel_onlyFeatures(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True, threads=2, chunksize=4) + _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) \ No newline at end of file From 71804e413f1ccb07ddb6abdbf56de0a323c36cce Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 20 Nov 2024 08:26:43 -0500 Subject: [PATCH 15/29] raise error if standard deviation computed is 0 This is appropriate since mean and standard deviation are always computed in the context of normalization and cannot normalize if the standard deviation is 0 --- pyprophet/stats.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyprophet/stats.py b/pyprophet/stats.py index 349ef7f..f037e27 100644 --- a/pyprophet/stats.py +++ b/pyprophet/stats.py @@ -118,7 +118,10 @@ def posterior_chromatogram_hypotheses_fast(experiment, prior_chrom_null): def mean_and_std_dev(values): - return np.mean(values), np.std(values, ddof=1) + std = np.std(values, ddof=1) + if std == 0: + raise RuntimeError("Computed standard deviation is 0, cannot perform normalization") + return np.mean(values), std def pnorm(stat, stat0): From 516389e37244439d2081d7a26571b975a21e3aae Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 11:31:21 -0500 Subject: [PATCH 16/29] test: set tree method as exact for tests --- pyprophet/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyprophet/main.py b/pyprophet/main.py index b3257ea..24f97ab 100644 --- a/pyprophet/main.py +++ b/pyprophet/main.py @@ -106,6 +106,8 @@ def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fracti xgb_hyperparams = {'autotune': xgb_autotune, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33} xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 
'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'} + if test: + xgb_params['tree_method'] = 'exact' xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'} From ca4a14eb83487b6ffba583f739e5c474a0258bc1 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 11:38:47 -0500 Subject: [PATCH 17/29] update snapshot tests --- .../test_pyprophet_score.test_osw_4.out | 20 +++++++++---------- .../test_pyprophet_score.test_osw_5.out | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/_regtest_outputs/test_pyprophet_score.test_osw_4.out b/tests/_regtest_outputs/test_pyprophet_score.test_osw_4.out index 8f458fc..bde61a3 100644 --- a/tests/_regtest_outputs/test_pyprophet_score.test_osw_4.out +++ b/tests/_regtest_outputs/test_pyprophet_score.test_osw_4.out @@ -1,14 +1,14 @@ feature_id ms1_precursor_pep ms2_peakgroup_pep ms2_precursor_pep -0 -4409520928686189639 0.0045 0.0045 0.1757 -1 -7771919224870429764 0.0045 0.0045 0.1757 -2 -797725006165535344 0.0045 0.0045 0.1291 -3 -1732939685941081620 0.0045 0.0045 0.1757 -4 -6747816958328369759 0.0045 0.0045 0.1757 +0 -4409520928686189639 0.0008 0.0024 0.8813 +1 -7771919224870429764 0.0008 0.0024 0.8813 +2 -797725006165535344 0.0008 0.0024 0.0883 +3 -1732939685941081620 0.0008 0.0024 0.8611 +4 -6747816958328369759 0.0008 0.0024 0.8813 .. ... ... ... ... 
-95 237580321205345393 0.0045 0.0045 0.1291 -96 5416940836005312912 0.0045 0.0045 0.1291 -97 -7541234528799769804 0.0045 0.0045 0.1757 -98 8036548921756545335 0.0045 0.0045 0.1291 -99 -6558503086717676095 0.0045 0.0045 0.1291 +95 -6034887541083502974 0.0008 0.0024 0.8461 +96 483971408708572459 0.0008 0.0024 0.0883 +97 5086440667566053402 0.0008 0.0024 0.9278 +98 7291105701317857435 0.0008 0.0024 0.8813 +99 237580321205345393 0.0008 0.0024 0.8461 [100 rows x 4 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_score.test_osw_5.out b/tests/_regtest_outputs/test_pyprophet_score.test_osw_5.out index e8d7f7c..8b58143 100644 --- a/tests/_regtest_outputs/test_pyprophet_score.test_osw_5.out +++ b/tests/_regtest_outputs/test_pyprophet_score.test_osw_5.out @@ -1,14 +1,14 @@ feature_id ms1_precursor_pep ms2_peakgroup_pep ms2_precursor_pep -0 -4409520928686189639 0.0045 0.0043 0.6924 -1 -7771919224870429764 0.0045 0.0043 0.6924 -2 -797725006165535344 0.0045 0.0043 0.3053 -3 -1732939685941081620 0.0045 0.0043 0.6924 -4 -6747816958328369759 0.0045 0.0043 0.6924 +0 -4409520928686189639 0.0008 0.004 0.2748 +1 -7771919224870429764 0.0008 0.004 0.3580 +2 -797725006165535344 0.0008 0.004 0.2370 +3 -1732939685941081620 0.0008 0.004 0.2748 +4 -6747816958328369759 0.0008 0.004 0.3580 .. ... ... ... ... 
-95 -5977524328878179832 0.0045 0.0043 0.6924 -96 -6034887541083502974 0.0045 0.0043 0.6924 -97 483971408708572459 0.0045 0.0043 0.1735 -98 5086440667566053402 0.0045 0.0043 0.6924 -99 7291105701317857435 0.0045 0.0043 0.6924 +95 8943629340769664660 0.0008 0.004 0.2370 +96 -6034887541083502974 0.0008 0.004 0.2748 +97 483971408708572459 0.0008 0.004 0.2370 +98 5086440667566053402 0.0008 0.004 0.3580 +99 7291105701317857435 0.0008 0.004 0.2748 [100 rows x 4 columns] From 0704475b119455350ddbf5e97cf676778f5f2681 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 12:07:03 -0500 Subject: [PATCH 18/29] update actions to dependent on reuiqrements file this fixes the versions so tests do not fail. Will add dependabot to update the reuqirements file --- .github/workflows/ci.yml | 7 +-- requirements.txt | 118 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 requirements.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b33d29d..85a120e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,8 +9,8 @@ jobs: matrix: os: [ubuntu-latest] #os: [ubuntu-latest, windows-latest, macos-latest] # remove mac tests - # Latest pyOpenMS supports Python 3.10, and 3.11 - python-version: ["3.10", "3.11"] + # Requirements file generated with python=3.11 + python-version: ["3.11"] steps: - uses: actions/checkout@v4 @@ -21,8 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pytest - pip install pytest-regtest + pip install -r requirements.txt # test with requirements file so can easily bump with dependabot pip install . 
- name: Compile cython module diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3129b63 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,118 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --all-extras --output-file=requirements.txt +# +click==8.1.7 + # via pyprophet (setup.py) +cloudpickle==3.1.0 + # via hyperopt +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +cython==3.0.11 + # via pyprophet (setup.py) +duckdb==1.1.3 + # via + # duckdb-extension-sqlite-scanner + # duckdb-extensions + # pyprophet (setup.py) +duckdb-extension-sqlite-scanner==1.1.3 + # via pyprophet (setup.py) +duckdb-extensions==1.1.3 + # via pyprophet (setup.py) +fonttools==4.55.0 + # via matplotlib +future==1.0.0 + # via hyperopt +hyperopt==0.2.7 + # via pyprophet (setup.py) +iniconfig==2.0.0 + # via pytest +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +matplotlib==3.9.2 + # via pyprophet (setup.py) +networkx==3.2.1 + # via hyperopt +numexpr==2.10.1 + # via pyprophet (setup.py) +numpy==2.0.2 + # via + # contourpy + # hyperopt + # matplotlib + # numexpr + # pandas + # patsy + # pyprophet (setup.py) + # scikit-learn + # scipy + # statsmodels + # xgboost +nvidia-nccl-cu12==2.23.4 + # via xgboost +packaging==24.2 + # via + # matplotlib + # pytest + # statsmodels +pandas==2.2.3 + # via + # pyprophet (setup.py) + # statsmodels +patsy==1.0.1 + # via statsmodels +pillow==11.0.0 + # via matplotlib +pluggy==1.5.0 + # via pytest +py4j==0.10.9.7 + # via hyperopt +pyarrow==18.0.0 + # via pyprophet (setup.py) +pyparsing==3.2.0 + # via matplotlib +pypdf==5.1.0 + # via pyprophet (setup.py) +pytest==8.3.3 + # via + # pyprophet (setup.py) + # pytest-regtest +pytest-regtest==2.3.3 + # via pyprophet (setup.py) +python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas +pytz==2024.2 + # via pandas +scikit-learn==1.5.2 + # via pyprophet (setup.py) +scipy==1.13.1 
+ # via + # hyperopt + # pyprophet (setup.py) + # scikit-learn + # statsmodels + # xgboost +six==1.16.0 + # via + # hyperopt + # python-dateutil +statsmodels==0.14.4 + # via pyprophet (setup.py) +tabulate==0.9.0 + # via pyprophet (setup.py) +threadpoolctl==3.5.0 + # via scikit-learn +tqdm==4.67.0 + # via hyperopt +tzdata==2024.2 + # via pandas +xgboost==2.1.2 + # via pyprophet (setup.py) From ce075f312f7fec1bcb8b5759fbb4b3af4f6b0ba8 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 12:08:41 -0500 Subject: [PATCH 19/29] add dependabot --- .github/workflows/dependabot.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/workflows/dependabot.yml diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml new file mode 100644 index 0000000..7e6ee06 --- /dev/null +++ b/.github/workflows/dependabot.yml @@ -0,0 +1,9 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" # Location of your pyproject.toml or requirements.txt + schedule: + interval: "weekly" # Checks for updates every week + commit-message: + prefix: "deps" # Prefix for pull request titles + open-pull-requests-limit: 5 # Limit the number of open PRs at a time From a0235e04b56b3c76ea393d3cd492416b35e73c5f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 12:46:12 -0500 Subject: [PATCH 20/29] add tests for windows and mac --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 85a120e..37edb05 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,8 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest] - #os: [ubuntu-latest, windows-latest, macos-latest] # remove mac tests + os: [ubuntu-latest, windows-latest, macos-latest] # Requirements file generated with python=3.11 python-version: ["3.11"] steps: From 88291fa7941a95cbacfc2340e927819a923f0ee6 Mon Sep 17 00:00:00 2001 
From: Joshua Charkow Date: Thu, 21 Nov 2024 12:49:52 -0500 Subject: [PATCH 21/29] remove default_rng --- pyprophet/classifiers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyprophet/classifiers.py b/pyprophet/classifiers.py index 0aefd04..b23c96c 100644 --- a/pyprophet/classifiers.py +++ b/pyprophet/classifiers.py @@ -110,7 +110,6 @@ def objective(params): clf = xgb.XGBClassifier(random_state=42, verbosity=0, objective='binary:logitraw', eval_metric='auc', **params) - rng = np.random.default_rng(42) score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=42)).mean() # click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params)) return score From 08df7cde31f78f1e70c01d42a5534faa79d9c52d Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 12:53:56 -0500 Subject: [PATCH 22/29] remove mac tests --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37edb05..a3755dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-latest] # Requirements file generated with python=3.11 python-version: ["3.11"] steps: From 0c342e9fc59808c97d3cbf36efea61484506945e Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 13:03:58 -0500 Subject: [PATCH 23/29] remove copy of np arrays --- pyprophet/stats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyprophet/stats.py b/pyprophet/stats.py index f037e27..752ca52 100644 --- a/pyprophet/stats.py +++ b/pyprophet/stats.py @@ -236,7 +236,7 @@ def pi0est(p_values, lambda_ = np.arange(0.05,1.0,0.05), pi0_method = "smoother" @profile def qvalue(p_values, pi0, pfdr = False): - p = np.array(p_values).copy() + p = np.array(p_values) qvals_out = p rm_na 
= np.isfinite(p) @@ -280,7 +280,7 @@ def bw_nrd0(x): @profile def lfdr(p_values, pi0, trunc = True, monotone = True, transf = "probit", adj = 1.5, eps = np.power(10.0,-8)): """ Estimate local FDR / posterior error probability from p-values according to bioconductor/qvalue """ - p = np.array(p_values).copy() + p = np.array(p_values) # Compare to bioconductor/qvalue reference implementation # import rpy2 From 527c365926a51fbc8f1a1d636453917da22ab456 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 13:05:18 -0500 Subject: [PATCH 24/29] remove windows tests currently both mac and windows tests are failing so just keep ubuntu --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a3755dc..29617bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] + os: [ubuntu-latest] # Requirements file generated with python=3.11 python-version: ["3.11"] steps: From 9c6466596be407542d50cc75b3f1f6f86d756362 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 13:38:19 -0500 Subject: [PATCH 25/29] refactor: new function for normalizing score to decoys --- pyprophet/data_handling.py | 11 +++++++++++ pyprophet/semi_supervised.py | 6 +----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pyprophet/data_handling.py b/pyprophet/data_handling.py index a7c50a1..21e6953 100644 --- a/pyprophet/data_handling.py +++ b/pyprophet/data_handling.py @@ -5,6 +5,7 @@ import sys import os import multiprocessing +from sklearn.preprocessing import StandardScaler from .optimized import find_top_ranked, rank @@ -336,6 +337,16 @@ def get_top_target_peaks(self): def get_feature_matrix(self, use_main_score): min_col = 5 if use_main_score else 6 return self.df.iloc[:, min_col:-1].values + + def normalize_score_by_decoys(self, score_col_name): + ''' 
+ normalize the decoy scores to mean 0 and std 1, scale the targets accordingly + Args: + score_col_name: str, the name of the score column + ''' + td_scores = self.get_top_decoy_peaks()[score_col_name] + transform = StandardScaler().fit(td_scores.values.reshape(-1, 1)) + self.df.loc[:, score_col_name] = transform.transform(self.df[score_col_name].values.reshape(-1, 1)) def filter_(self, idx): return Experiment(self.df[idx]) diff --git a/pyprophet/semi_supervised.py b/pyprophet/semi_supervised.py index 87b56cb..cbe1099 100644 --- a/pyprophet/semi_supervised.py +++ b/pyprophet/semi_supervised.py @@ -64,13 +64,9 @@ def learn_randomized(self, experiment, score_columns, working_thread_number): # after semi supervised iteration: classify full dataset clf_scores = self.score(experiment, params) - mu, nu = mean_and_std_dev(clf_scores) experiment.set_and_rerank("classifier_score", clf_scores) - td_scores = experiment.get_top_decoy_peaks()["classifier_score"] - - mu, nu = mean_and_std_dev(td_scores) - experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu + experiment.normalize_score_by_decoys('classifier_score') experiment.rank_by("classifier_score") top_test_peaks = experiment.get_top_test_peaks() From 9be0a9a5fa9d0c0cb175f1a4daa7cc845fd6f816 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 13:52:22 -0500 Subject: [PATCH 26/29] replace semi-supervised learning normalization with sklearn replace semi-supervised learning normalization with sklearn which can handle cases where std = 0 --- pyprophet/semi_supervised.py | 8 ++------ pyprophet/stats.py | 5 +---- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pyprophet/semi_supervised.py b/pyprophet/semi_supervised.py index cbe1099..7735131 100644 --- a/pyprophet/semi_supervised.py +++ b/pyprophet/semi_supervised.py @@ -3,7 +3,7 @@ from .data_handling import Experiment, update_chosen_main_score_in_table from .classifiers import AbstractLearner, XGBLearner -from .stats 
import mean_and_std_dev, find_cutoff +from .stats import find_cutoff try: profile @@ -88,13 +88,9 @@ def learn_final(self, experiment): # after semi supervised iteration: classify full dataset clf_scores = self.score(experiment, params) - mu, nu = mean_and_std_dev(clf_scores) experiment.set_and_rerank("classifier_score", clf_scores) - td_scores = experiment.get_top_decoy_peaks()["classifier_score"] - - mu, nu = mean_and_std_dev(td_scores) - experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu + experiment.normalize_score_by_decoys('classifier_score') experiment.rank_by("classifier_score") return params diff --git a/pyprophet/stats.py b/pyprophet/stats.py index 752ca52..6934e87 100644 --- a/pyprophet/stats.py +++ b/pyprophet/stats.py @@ -118,10 +118,7 @@ def posterior_chromatogram_hypotheses_fast(experiment, prior_chrom_null): def mean_and_std_dev(values): - std = np.std(values, ddof=1) - if std == 0: - raise RuntimeError("Computed standard deviation is 0, cannot perform normalization") - return np.mean(values), std + return np.mean(values), np.std(values, ddof=1) def pnorm(stat, stat0): From aa70dbef89dfd27ad554ceddbb51eac393bd1f21 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 14:00:48 -0500 Subject: [PATCH 27/29] revert to numpy std sklearn transformation does not use degrees of freedom. 
Keep the normalize_score_by_decoys method but use numpy std() method --- pyprophet/data_handling.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyprophet/data_handling.py b/pyprophet/data_handling.py index 21e6953..29c52b7 100644 --- a/pyprophet/data_handling.py +++ b/pyprophet/data_handling.py @@ -5,7 +5,7 @@ import sys import os import multiprocessing -from sklearn.preprocessing import StandardScaler +from .stats import mean_and_std_dev from .optimized import find_top_ranked, rank @@ -345,8 +345,13 @@ def normalize_score_by_decoys(self, score_col_name): score_col_name: str, the name of the score column ''' td_scores = self.get_top_decoy_peaks()[score_col_name] - transform = StandardScaler().fit(td_scores.values.reshape(-1, 1)) - self.df.loc[:, score_col_name] = transform.transform(self.df[score_col_name].values.reshape(-1, 1)) + mu, nu = mean_and_std_dev(td_scores) + + if nu == 0: + raise Exception("Warning: Standard deviation of decoy scores is zero. Cannot normalize scores.") + + self.df.loc[:, score_col_name] = (self.df[score_col_name] - mu) / nu + def filter_(self, idx): return Experiment(self.df[idx]) From 005af6c9c1e67fbb1a3acf976293aa29b824c6b4 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 21 Nov 2024 14:05:31 -0500 Subject: [PATCH 28/29] minor updates to pyproject.toml --- pyproject.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06fa77d..00f104d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyprophet" -version = "2.2.8" +version = "2.2.9" description = "PyProphet: Semi-supervised learning and scoring of OpenSWATH results." 
readme = { file = "README.md", content-type = "text/markdown" } license = { text = "BSD" } @@ -26,7 +26,7 @@ dependencies = [ "duckdb", "duckdb-extensions", "duckdb-extension-sqlite-scanner", - "numpy >= 1.9.0", + "numpy >= 2.0", "scipy", "pandas >= 0.17", "cython", @@ -41,6 +41,10 @@ dependencies = [ "pypdf" ] +# Optional dependencies +[project.optional-dependencies] +testing = ["pytest", "pytest-regtest"] + # Define console entry points [project.scripts] pyprophet = "pyprophet.main:cli" @@ -48,4 +52,4 @@ pyprophet = "pyprophet.main:cli" [tool.setuptools] packages = { find = { exclude = ["ez_setup", "examples", "tests"] } } include-package-data = true -zip-safe = false \ No newline at end of file +zip-safe = false From 81f78bf16d437d241852797cb6042f4cd8c8e5db Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 27 Nov 2024 10:47:30 -0500 Subject: [PATCH 29/29] fix: ValueError: Buffer dtype mismatch, expected 'DATA_TYPE' but got 'double' need to cast to np.float32 --- .gitignore | 1 + pyprophet/data_handling.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f374cdc..d9a6697 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ nosetests.xml # vim *.sw[opqrs] +*~ diff --git a/pyprophet/data_handling.py b/pyprophet/data_handling.py index 29c52b7..23ddd65 100644 --- a/pyprophet/data_handling.py +++ b/pyprophet/data_handling.py @@ -360,7 +360,7 @@ def filter_(self, idx): def add_peak_group_rank(self): ids = self.df.tg_num_id.values scores = self.df.d_score.values - peak_group_ranks = rank(ids, scores) + peak_group_ranks = rank(ids, scores.astype(np.float32, copy=False)) self.df["peak_group_rank"] = peak_group_ranks @profile