changes to test

avinashbarnwal · Sep 24, 2019 · 53fa306 · 53fa306
1 parent a001cbf
commit 53fa306
Show file tree

Hide file tree

Showing 11 changed files with 461 additions and 15 deletions.
diff --git a/build/lib/stepwisereg/__init__.py b/build/lib/stepwisereg/__init__.py
@@ -0,0 +1,5 @@
+
+# -*- coding: utf-8 -*-
+
+from .stepwisereg import *
+
diff --git a/build/lib/stepwisereg/stepwisereg.py b/build/lib/stepwisereg/stepwisereg.py
@@ -0,0 +1,72 @@
+import numpy as np
+import warnings
+import os
+import statsmodels.formula.api as smf
+import pandas as pd
+import functools
+import re
+warnings.filterwarnings('ignore')
+
+
+class stepwise:
+    """Forward stepwise selection of OLS predictors, scored by AIC.
+
+    Attributes:
+    -----------
+    step : maximum number of predictors a single call to fit may add
+    fit_intercept : when equal to 0, "-1" is appended to every formula
+                    (no intercept); any other value leaves formulas as is
+    """
+
+    def __init__(self,step,fit_intercept):
+        self.step = step
+        self.fit_intercept = fit_intercept
+
+    def reduce_concat(self,x, sep=""):
+        """Return the items of x converted to str and joined with sep.
+
+        An empty x gives "" and a single item is returned as a string,
+        so the result is always a str.
+        """
+        return sep.join(str(item) for item in x)
+
+    @staticmethod
+    def _rhs_terms(formula):
+        """Return the stripped '+'-separated right-hand-side terms of formula, skipping the first."""
+        rhs = formula.split('~')[1]
+        # Strip whitespace so "a+b" and "a + b" name the same terms.
+        return [term.strip() for term in rhs.split('+')][1:]
+
+    def _formula(self,response,terms):
+        """Build the "response ~ t1 + t2" formula string, appending "-1" when fit_intercept == 0."""
+        formula = "{} ~ {}".format(response,' + '.join(terms))
+        if self.fit_intercept == 0:
+            formula = formula + "-1"
+        return formula
+
+    def fit(self,data,null_formula,full_formula,response):
+
+        """Linear model designed by forward selection.
+        Parameters:
+        -----------
+        data : pandas DataFrame with all possible predictors and response
+        null_formula : string "response ~ t0 + t1 + ..." listing the terms
+                       the model starts with
+        full_formula : string of the same form listing every term that may
+                       be added
+        response: string, name of response column in data
+
+        The first right-hand-side term of both formulas is skipped
+        (presumably an intercept placeholder such as "1" -- confirm with
+        callers), so a predictor written first is never used.
+
+        Returns:
+        --------
+        model: fitted statsmodels OLS results for the selected terms,
+               selected by forward selection
+               evaluated by aic
+               (without an intercept when fit_intercept == 0)
+        """
+
+        selected      = self._rhs_terms(null_formula)
+        remaining     = {term for term in self._rhs_terms(full_formula)
+                         if term not in selected}
+        current_score = float('inf')
+        # Count down on a local copy: self.step must survive the call so the
+        # same instance can be fitted again with the same step budget.
+        steps_left    = self.step
+
+        while remaining and steps_left > 0:
+            # Lowest AIC wins; ties are broken by candidate name.
+            best_new_score, best_candidate = min(
+                (smf.ols(self._formula(response, selected + [candidate]), data).fit().aic,
+                 candidate)
+                for candidate in remaining
+            )
+            # Stop as soon as no candidate strictly improves the AIC.
+            if not current_score > best_new_score:
+                break
+            remaining.remove(best_candidate)
+            selected.append(best_candidate)
+            current_score = best_new_score
+            steps_left -= 1
+        model = smf.ols(self._formula(response, selected), data).fit()
+        return model
+
diff --git a/stepwisereg/.ipynb_checkpoints/__init__-checkpoint.py b/stepwisereg/.ipynb_checkpoints/__init__-checkpoint.py
@@ -0,0 +1 @@
+name = "stepwisereg"
diff --git a/stepwisereg/.ipynb_checkpoints/stepwisereg-checkpoint.py b/stepwisereg/.ipynb_checkpoints/stepwisereg-checkpoint.py
@@ -0,0 +1,72 @@
+import numpy as np
+import warnings
+import os
+import statsmodels.formula.api as smf
+import pandas as pd
+import functools
+import re
+warnings.filterwarnings('ignore')
+
+
+class stepwise:
+    """Forward stepwise selection of OLS predictors, scored by AIC.
+
+    Attributes:
+    -----------
+    step : maximum number of predictors a single call to fit may add
+    fit_intercept : when equal to 0, "-1" is appended to every formula
+                    (no intercept); any other value leaves formulas as is
+    """
+
+    def __init__(self,step,fit_intercept):
+        self.step = step
+        self.fit_intercept = fit_intercept
+
+    def reduce_concat(self,x, sep=""):
+        """Return the items of x converted to str and joined with sep.
+
+        An empty x gives "" and a single item is returned as a string,
+        so the result is always a str.
+        """
+        return sep.join(str(item) for item in x)
+
+    @staticmethod
+    def _rhs_terms(formula):
+        """Return the stripped '+'-separated right-hand-side terms of formula, skipping the first."""
+        rhs = formula.split('~')[1]
+        # Strip whitespace so "a+b" and "a + b" name the same terms.
+        return [term.strip() for term in rhs.split('+')][1:]
+
+    def _formula(self,response,terms):
+        """Build the "response ~ t1 + t2" formula string, appending "-1" when fit_intercept == 0."""
+        formula = "{} ~ {}".format(response,' + '.join(terms))
+        if self.fit_intercept == 0:
+            formula = formula + "-1"
+        return formula
+
+    def fit(self,data,null_formula,full_formula,response):
+
+        """Linear model designed by forward selection.
+        Parameters:
+        -----------
+        data : pandas DataFrame with all possible predictors and response
+        null_formula : string "response ~ t0 + t1 + ..." listing the terms
+                       the model starts with
+        full_formula : string of the same form listing every term that may
+                       be added
+        response: string, name of response column in data
+
+        The first right-hand-side term of both formulas is skipped
+        (presumably an intercept placeholder such as "1" -- confirm with
+        callers), so a predictor written first is never used.
+
+        Returns:
+        --------
+        model: fitted statsmodels OLS results for the selected terms,
+               selected by forward selection
+               evaluated by aic
+               (without an intercept when fit_intercept == 0)
+        """
+
+        selected      = self._rhs_terms(null_formula)
+        remaining     = {term for term in self._rhs_terms(full_formula)
+                         if term not in selected}
+        current_score = float('inf')
+        # Count down on a local copy: self.step must survive the call so the
+        # same instance can be fitted again with the same step budget.
+        steps_left    = self.step
+
+        while remaining and steps_left > 0:
+            # Lowest AIC wins; ties are broken by candidate name.
+            best_new_score, best_candidate = min(
+                (smf.ols(self._formula(response, selected + [candidate]), data).fit().aic,
+                 candidate)
+                for candidate in remaining
+            )
+            # Stop as soon as no candidate strictly improves the AIC.
+            if not current_score > best_new_score:
+                break
+            remaining.remove(best_candidate)
+            selected.append(best_candidate)
+            current_score = best_new_score
+            steps_left -= 1
+        model = smf.ols(self._formula(response, selected), data).fit()
+        return model
+
diff --git a/stepwisereg/__init__.py b/stepwisereg/__init__.py
@@ -1 +1,5 @@
-name = "stepwisereg"
+
+# -*- coding: utf-8 -*-
+
+from .stepwisereg import *
+
diff --git a/stepwisereg/stepwisereg.py b/stepwisereg/stepwisereg.py
@@ -1,21 +1,14 @@
 import numpy as np
-import sklearn
 import warnings
 import os
-from   sklearn import datasets, linear_model
-from   sklearn.metrics import mean_squared_error, r2_score
-from   sklearn.datasets import make_regression
-from   patsy import dmatrices
 import statsmodels.formula.api as smf
 import pandas as pd
 import functools
 import re
 warnings.filterwarnings('ignore')
 
 
-# In[10]:
-
-class stepwisereg:
+class stepwise:
 
     def __init__(self,step,fit_intercept):
         self.step = step
@@ -53,8 +46,8 @@ def fit(self,data,null_formula,full_formula,response):
         current_score, best_new_score = float('inf'), float('inf')
         score_selected   = []
         variable_added   = []
-
-        while (remaining and current_score == best_new_score and step >0):
+        
+        while (remaining and current_score == best_new_score and self.step >0):
             scores_with_candidates = []
             for candidate in remaining:
                 formula = "{} ~ {}".format(response,' + '.join(selected + [candidate]))
@@ -70,7 +63,7 @@ def fit(self,data,null_formula,full_formula,response):
                 score_selected.append(best_new_score)
                 variable_added.append(best_candidate)
                 current_score = best_new_score
-            step=step-1
+            self.step=self.step-1
         formula = "{} ~ {}".format(response,' + '.join(selected))
         if self.fit_intercept == 0:
             formula = formula + "-1"

diff --git a/stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.ipynb b/stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import stepwisereg\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "data = pd.read_csv(\"lung_cancer.csv\")\n",
+    "###########Train Dataset and Test Dataset Creation########\n",
+    "\n",
+    "msk = np.random.rand(len(data)) < 0.8\n",
+    "train = data[msk]\n",
+    "test  = data[~msk]\n",
+    "\n",
+    "#########Independent Variables are from 2:102 and 106 has Dependent Variable########\n",
+    "X_train = data.iloc[:,2:102]\n",
+    "Y_train = 10*data.iloc[:,106]\n",
+    "\n",
+    "###########Changes in the name of columns######\n",
+    "columns         = list(X_train.columns.values)\n",
+    "columns_changes = list(map(lambda x:x.replace(\"-\", \"_\"),columns))\n",
+    "X_train.columns = columns_changes\n",
+    "train = pd.concat([X_train,Y_train],axis=1)\n",
+    "\n",
+    "X_test = test.iloc[:,2:102]\n",
+    "Y_test = 10*test.iloc[:,106]\n",
+    "\n",
+    "columns_test         = list(X_test.columns.values)\n",
+    "columns_changes_test = map(lambda x:x.replace(\"-\", \"_\"),columns_test)\n",
+    "X_test.columns       = columns_changes_test\n",
+    "test = pd.concat([X_test,Y_test],axis=1)\n",
+    "\n",
+    "##Creating the features concatenation\n",
+    "features = \"+\".join(columns_changes)\n",
+    "\n",
+    "##Creating Null and Full formula\n",
+    "var1 = columns_changes[0]  \n",
+    "null = 'OS_MONTHS ~' + var1\n",
+    "full = 'OS_MONTHS ~' + features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                            OLS Regression Results                            \n",
+      "==============================================================================\n",
+      "Dep. Variable:              OS_MONTHS   R-squared:                       0.296\n",
+      "Model:                            OLS   Adj. R-squared:                  0.286\n",
+      "Method:                 Least Squares   F-statistic:                     28.99\n",
+      "Date:                Tue, 24 Sep 2019   Prob (F-statistic):           1.49e-24\n",
+      "Time:                        18:04:02   Log-Likelihood:                -1244.2\n",
+      "No. Observations:                 351   AIC:                             2500.\n",
+      "Df Residuals:                     345   BIC:                             2524.\n",
+      "Df Model:                           5                                         \n",
+      "Covariance Type:            nonrobust                                         \n",
+      "==============================================================================\n",
+      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
+      "------------------------------------------------------------------------------\n",
+      "Intercept     34.8287      4.693      7.421      0.000      25.598      44.060\n",
+      "FAM159B       10.8594      1.964      5.530      0.000       6.997      14.722\n",
+      "EXOSC6        10.2559      1.915      5.357      0.000       6.490      14.022\n",
+      "PRORY         16.5512      2.959      5.594      0.000      10.732      22.371\n",
+      "ASAP3          5.0476      1.119      4.509      0.000       2.846       7.249\n",
+      "ACSM5         12.5162      2.816      4.445      0.000       6.978      18.054\n",
+      "==============================================================================\n",
+      "Omnibus:                       95.928   Durbin-Watson:                   2.010\n",
+      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              253.602\n",
+      "Skew:                           1.291   Prob(JB):                     8.53e-56\n",
+      "Kurtosis:                       6.267   Cond. No.                         25.4\n",
+      "==============================================================================\n",
+      "\n",
+      "Warnings:\n",
+      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model           = stepwisereg.stepwise(5,1)\n",
+    "model_fit       = model.fit(train,null,full,'OS_MONTHS')\n",
+    "model_param     = model_fit.params\n",
+    "test_predict    = model_fit.predict(test)\n",
+    "print(model_fit.summary())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.py b/stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import stepwisereg
+import numpy as np
+
+
+
+data = pd.read_csv("lung_cancer.csv")
+###########Train Dataset and Test Dataset Creation########
+# Random row split with roughly 80% of rows in train; no seed is set,
+# so the split differs between runs.
+msk = np.random.rand(len(data)) < 0.8
+train = data[msk]
+test  = data[~msk]
+
+#########Independent Variables are from 2:102 and 106 has Dependent Variable########
+# Slice the training rows only, so the held-out rows stay out of the fit.
+X_train = train.iloc[:,2:102]
+Y_train = 10*train.iloc[:,106]
+
+###########Changes in the name of columns######
+# "-" is replaced by "_" so each column name can be used as a formula term.
+columns         = list(X_train.columns.values)
+columns_changes = [name.replace("-", "_") for name in columns]
+X_train.columns = columns_changes
+train = pd.concat([X_train,Y_train],axis=1)
+
+X_test = test.iloc[:,2:102]
+Y_test = 10*test.iloc[:,106]
+
+# Build a real list (not a lazy map object) before assigning column labels.
+columns_test         = list(X_test.columns.values)
+columns_changes_test = [name.replace("-", "_") for name in columns_test]
+X_test.columns       = columns_changes_test
+test = pd.concat([X_test,Y_test],axis=1)
+
+##Creating the features concatenation
+features = "+".join(columns_changes)
+
+##Creating Null and Full formula
+var1 = columns_changes[0]  
+null = 'OS_MONTHS ~' + var1
+full = 'OS_MONTHS ~' + features
+
+# Up to 5 forward-selection steps, intercept kept (fit_intercept=1).
+model           = stepwisereg.stepwise(5,1)
+model_fit       = model.fit(train,null,full,'OS_MONTHS')
+model_param     = model_fit.params
+test_predict    = model_fit.predict(test)
+print(model_fit.summary())
diff --git a/stepwisereg/example/lung_cancer.csv → stepwisereg/test/lung_cancer.csv b/stepwisereg/example/lung_cancer.csv → stepwisereg/test/lung_cancer.csv
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,5 @@

		# -*- coding: utf-8 -*-

		from .stepwisereg import *