Skip to content

Commit

Permalink
changes to test
Browse files Browse the repository at this point in the history
  • Loading branch information
avinashbarnwal committed Sep 24, 2019
1 parent a001cbf commit 53fa306
Show file tree
Hide file tree
Showing 11 changed files with 461 additions and 15 deletions.
5 changes: 5 additions & 0 deletions build/lib/stepwisereg/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

# -*- coding: utf-8 -*-

from .stepwisereg import *

72 changes: 72 additions & 0 deletions build/lib/stepwisereg/stepwisereg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np
import warnings
import os
import statsmodels.formula.api as smf
import pandas as pd
import functools
import re
warnings.filterwarnings('ignore')


class stepwise:
    """Forward stepwise selection of OLS predictors, scored by AIC.

    Attributes
    ----------
    step : maximum number of predictors a single ``fit`` call may add.
    fit_intercept : when equal to 0, ``-1`` is appended to every formula
        (model without intercept); any other value leaves the formula as is.
    """

    def __init__(self, step, fit_intercept):
        self.step = step
        self.fit_intercept = fit_intercept

    def reduce_concat(self, x, sep=""):
        """Fold the items of *x* into one string, joined by *sep*.

        Items are converted with ``str`` as they are combined. A one-item
        iterable yields that item unchanged (no conversion), and an empty
        iterable raises ``TypeError`` -- both follow from ``functools.reduce``
        being called without an initial value.
        """
        return functools.reduce(lambda x, y: str(x) + sep + str(y), x)

    @staticmethod
    def _predictor_terms(formula):
        """Return the stripped '+'-separated right-hand-side terms, minus the first.

        The first term is always discarded (it is treated as the intercept
        placeholder, e.g. the ``1`` in ``"y ~ 1 + x1"``).
        NOTE(review): a formula such as ``"y ~ x1"`` therefore contributes no
        predictors at all -- confirm callers rely on this.
        """
        rhs = formula.split('~')[1]
        terms = [term.strip() for term in rhs.split('+')]
        # Strip so that "x1" and " x1 " from differently spaced formulas match.
        return [term for term in terms[1:] if term]

    def _formula(self, response, terms):
        """Build the model formula for *response* regressed on *terms*."""
        # An empty right-hand side is spelled as the explicit intercept term
        # so the formula string stays well-formed.
        rhs = ' + '.join(terms) if terms else '1'
        formula = "{} ~ {}".format(response, rhs)
        if self.fit_intercept == 0:
            formula = formula + "-1"
        return formula

    def fit(self, data, null_formula, full_formula, response):
        """Fit a linear model chosen by forward selection.

        Parameters:
        -----------
        data : pandas DataFrame with all possible predictors and response
        null_formula : string, formula of the starting model; its predictors
            (all right-hand-side terms after the first) are always kept
        full_formula : string, formula of the largest model; its predictors
            (all right-hand-side terms after the first) that are not already
            in the null model are the candidates
        response: string, name of response column in data

        Returns:
        --------
        model: fitted statsmodels OLS results for the selected predictors.
               Starting from the null model, the candidate giving the lowest
               AIC is added at each step; selection stops when no candidate
               lowers the AIC, when ``self.step`` additions have been made,
               or when no candidates remain. The intercept is removed when
               ``self.fit_intercept == 0``.
        """
        selected = self._predictor_terms(null_formula)

        # Candidates: terms of the full model not already in the null model,
        # de-duplicated while keeping their order of appearance.
        seen = set(selected)
        remaining = []
        for term in self._predictor_terms(full_formula):
            if term not in seen:
                seen.add(term)
                remaining.append(term)

        current_score = float('inf')
        # Count down on a local so repeated fit() calls on the same instance
        # each get the full step budget (self.step is left untouched).
        steps_left = self.step

        while remaining and steps_left > 0:
            # Lowest AIC wins; ties fall back to comparing candidate names.
            best_new_score, best_candidate = min(
                (smf.ols(self._formula(response, selected + [candidate]),
                         data).fit().aic, candidate)
                for candidate in remaining
            )
            if not current_score > best_new_score:
                break  # no candidate improves the AIC any further
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            steps_left -= 1

        return smf.ols(self._formula(response, selected), data).fit()

1 change: 1 addition & 0 deletions stepwisereg/.ipynb_checkpoints/__init__-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
name = "stepwisereg"
72 changes: 72 additions & 0 deletions stepwisereg/.ipynb_checkpoints/stepwisereg-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np
import warnings
import os
import statsmodels.formula.api as smf
import pandas as pd
import functools
import re
warnings.filterwarnings('ignore')


class stepwise:
    """Forward stepwise selection of OLS predictors, scored by AIC.

    Attributes
    ----------
    step : maximum number of predictors a single ``fit`` call may add.
    fit_intercept : when equal to 0, ``-1`` is appended to every formula
        (model without intercept); any other value leaves the formula as is.
    """

    def __init__(self, step, fit_intercept):
        self.step = step
        self.fit_intercept = fit_intercept

    def reduce_concat(self, x, sep=""):
        """Fold the items of *x* into one string, joined by *sep*.

        Items are converted with ``str`` as they are combined. A one-item
        iterable yields that item unchanged (no conversion), and an empty
        iterable raises ``TypeError`` -- both follow from ``functools.reduce``
        being called without an initial value.
        """
        return functools.reduce(lambda x, y: str(x) + sep + str(y), x)

    @staticmethod
    def _predictor_terms(formula):
        """Return the stripped '+'-separated right-hand-side terms, minus the first.

        The first term is always discarded (it is treated as the intercept
        placeholder, e.g. the ``1`` in ``"y ~ 1 + x1"``).
        NOTE(review): a formula such as ``"y ~ x1"`` therefore contributes no
        predictors at all -- confirm callers rely on this.
        """
        rhs = formula.split('~')[1]
        terms = [term.strip() for term in rhs.split('+')]
        # Strip so that "x1" and " x1 " from differently spaced formulas match.
        return [term for term in terms[1:] if term]

    def _formula(self, response, terms):
        """Build the model formula for *response* regressed on *terms*."""
        # An empty right-hand side is spelled as the explicit intercept term
        # so the formula string stays well-formed.
        rhs = ' + '.join(terms) if terms else '1'
        formula = "{} ~ {}".format(response, rhs)
        if self.fit_intercept == 0:
            formula = formula + "-1"
        return formula

    def fit(self, data, null_formula, full_formula, response):
        """Fit a linear model chosen by forward selection.

        Parameters:
        -----------
        data : pandas DataFrame with all possible predictors and response
        null_formula : string, formula of the starting model; its predictors
            (all right-hand-side terms after the first) are always kept
        full_formula : string, formula of the largest model; its predictors
            (all right-hand-side terms after the first) that are not already
            in the null model are the candidates
        response: string, name of response column in data

        Returns:
        --------
        model: fitted statsmodels OLS results for the selected predictors.
               Starting from the null model, the candidate giving the lowest
               AIC is added at each step; selection stops when no candidate
               lowers the AIC, when ``self.step`` additions have been made,
               or when no candidates remain. The intercept is removed when
               ``self.fit_intercept == 0``.
        """
        selected = self._predictor_terms(null_formula)

        # Candidates: terms of the full model not already in the null model,
        # de-duplicated while keeping their order of appearance.
        seen = set(selected)
        remaining = []
        for term in self._predictor_terms(full_formula):
            if term not in seen:
                seen.add(term)
                remaining.append(term)

        current_score = float('inf')
        # Count down on a local so repeated fit() calls on the same instance
        # each get the full step budget (self.step is left untouched).
        steps_left = self.step

        while remaining and steps_left > 0:
            # Lowest AIC wins; ties fall back to comparing candidate names.
            best_new_score, best_candidate = min(
                (smf.ols(self._formula(response, selected + [candidate]),
                         data).fit().aic, candidate)
                for candidate in remaining
            )
            if not current_score > best_new_score:
                break  # no candidate improves the AIC any further
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            steps_left -= 1

        return smf.ols(self._formula(response, selected), data).fit()

6 changes: 5 additions & 1 deletion stepwisereg/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
name = "stepwisereg"

# -*- coding: utf-8 -*-

from .stepwisereg import *

15 changes: 4 additions & 11 deletions stepwisereg/stepwisereg.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
import numpy as np
import sklearn
import warnings
import os
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from patsy import dmatrices
import statsmodels.formula.api as smf
import pandas as pd
import functools
import re
warnings.filterwarnings('ignore')


# In[10]:

class stepwisereg:
class stepwise:

def __init__(self,step,fit_intercept):
self.step = step
Expand Down Expand Up @@ -53,8 +46,8 @@ def fit(self,data,null_formula,full_formula,response):
current_score, best_new_score = float('inf'), float('inf')
score_selected = []
variable_added = []

while (remaining and current_score == best_new_score and step >0):
while (remaining and current_score == best_new_score and self.step >0):
scores_with_candidates = []
for candidate in remaining:
formula = "{} ~ {}".format(response,' + '.join(selected + [candidate]))
Expand All @@ -70,7 +63,7 @@ def fit(self,data,null_formula,full_formula,response):
score_selected.append(best_new_score)
variable_added.append(best_candidate)
current_score = best_new_score
step=step-1
self.step=self.step-1
formula = "{} ~ {}".format(response,' + '.join(selected))
if self.fit_intercept == 0:
formula = formula + "-1"
Expand Down
126 changes: 126 additions & 0 deletions stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import stepwisereg\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"\n",
"data = pd.read_csv(\"lung_cancer.csv\")\n",
"###########Train Dataset and Test Dataset Creation########\n",
"\n",
"msk = np.random.rand(len(data)) < 0.8\n",
"train = data[msk]\n",
"test = data[~msk]\n",
"\n",
"#########Independent Variables are from 2:102 and 106 has Dependent Variable########\n",
"X_train = data.iloc[:,2:102]\n",
"Y_train = 10*data.iloc[:,106]\n",
"\n",
"###########Changes in the name of columns######\n",
"columns = list(X_train.columns.values)\n",
"columns_changes = list(map(lambda x:x.replace(\"-\", \"_\"),columns))\n",
"X_train.columns = columns_changes\n",
"train = pd.concat([X_train,Y_train],axis=1)\n",
"\n",
"X_test = test.iloc[:,2:102]\n",
"Y_test = 10*test.iloc[:,106]\n",
"\n",
"columns_test = list(X_test.columns.values)\n",
"columns_changes_test = map(lambda x:x.replace(\"-\", \"_\"),columns_test)\n",
"X_test.columns = columns_changes_test\n",
"test = pd.concat([X_test,Y_test],axis=1)\n",
"\n",
"##Creating the features concatenation\n",
"features = \"+\".join(columns_changes)\n",
"\n",
"##Creating Null and Full formula\n",
"var1 = columns_changes[0] \n",
"null = 'OS_MONTHS ~' + var1\n",
"full = 'OS_MONTHS ~' + features"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: OS_MONTHS R-squared: 0.296\n",
"Model: OLS Adj. R-squared: 0.286\n",
"Method: Least Squares F-statistic: 28.99\n",
"Date: Tue, 24 Sep 2019 Prob (F-statistic): 1.49e-24\n",
"Time: 18:04:02 Log-Likelihood: -1244.2\n",
"No. Observations: 351 AIC: 2500.\n",
"Df Residuals: 345 BIC: 2524.\n",
"Df Model: 5 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"Intercept 34.8287 4.693 7.421 0.000 25.598 44.060\n",
"FAM159B 10.8594 1.964 5.530 0.000 6.997 14.722\n",
"EXOSC6 10.2559 1.915 5.357 0.000 6.490 14.022\n",
"PRORY 16.5512 2.959 5.594 0.000 10.732 22.371\n",
"ASAP3 5.0476 1.119 4.509 0.000 2.846 7.249\n",
"ACSM5 12.5162 2.816 4.445 0.000 6.978 18.054\n",
"==============================================================================\n",
"Omnibus: 95.928 Durbin-Watson: 2.010\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 253.602\n",
"Skew: 1.291 Prob(JB): 8.53e-56\n",
"Kurtosis: 6.267 Cond. No. 25.4\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"model = stepwisereg.stepwise(5,1)\n",
"model_fit = model.fit(train,null,full,'OS_MONTHS')\n",
"model_param = model_fit.params\n",
"test_predict = model_fit.predict(test)\n",
"print(model_fit.summary())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
44 changes: 44 additions & 0 deletions stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import stepwisereg
import numpy as np



data = pd.read_csv("lung_cancer.csv")

# ---- Train / test split ----------------------------------------------------
# Random row split with roughly 80% of rows in train. No seed is set, so the
# split differs from run to run.
msk = np.random.rand(len(data)) < 0.8
train = data[msk]
test = data[~msk]

# ---- Training frame --------------------------------------------------------
# Predictors are columns 2..101; column 106 holds the response (scaled by 10).
# Slice the training rows only, so held-out rows never reach the fit.
X_train = train.iloc[:, 2:102].copy()
Y_train = 10 * train.iloc[:, 106]

# Replace "-" with "_" in predictor names (presumably because "-" would be
# read as an operator inside the model formula).
columns = list(X_train.columns.values)
columns_changes = [name.replace("-", "_") for name in columns]
X_train.columns = columns_changes
train = pd.concat([X_train, Y_train], axis=1)

# ---- Test frame, built the same way ----------------------------------------
X_test = test.iloc[:, 2:102].copy()
Y_test = 10 * test.iloc[:, 106]

columns_test = list(X_test.columns.values)
columns_changes_test = [name.replace("-", "_") for name in columns_test]
X_test.columns = columns_changes_test
test = pd.concat([X_test, Y_test], axis=1)

# ---- Formulas --------------------------------------------------------------
# All predictor names joined into the right-hand side of the full model.
features = "+".join(columns_changes)

# Assumes column 106 is named OS_MONTHS -- TODO confirm against the CSV.
# NOTE(review): stepwise.fit discards the first "+"-separated term of each
# formula, so var1 acts as a placeholder rather than a forced predictor --
# confirm this is intended.
var1 = columns_changes[0]
null = 'OS_MONTHS ~' + var1
full = 'OS_MONTHS ~' + features

# ---- Forward selection: at most 5 added predictors, intercept kept ---------
model = stepwisereg.stepwise(5, 1)
model_fit = model.fit(train, null, full, 'OS_MONTHS')
model_param = model_fit.params
test_predict = model_fit.predict(test)
print(model_fit.summary())
File renamed without changes.
Loading

0 comments on commit 53fa306

Please sign in to comment.