-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a001cbf
commit 53fa306
Showing
11 changed files
with
461 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
|
||
# -*- coding: utf-8 -*- | ||
|
||
from .stepwisereg import * | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import numpy as np | ||
import warnings | ||
import os | ||
import statsmodels.formula.api as smf | ||
import pandas as pd | ||
import functools | ||
import re | ||
warnings.filterwarnings('ignore') | ||
|
||
|
||
class stepwise: | ||
|
||
def __init__(self,step,fit_intercept): | ||
self.step = step | ||
self.fit_intercept = fit_intercept | ||
|
||
def reduce_concat(self,x, sep=""): | ||
return functools.reduce(lambda x, y: str(x) + sep + str(y), x) | ||
|
||
def fit(self,data,null_formula,full_formula,response): | ||
|
||
"""Linear model designed by forward selection. | ||
Parameters: | ||
----------- | ||
data : pandas DataFrame with all possible predictors and response | ||
response: string, name of response column in data | ||
Returns: | ||
-------- | ||
model: an "optimal" fitted statsmodels linear model | ||
with an intercept | ||
selected by forward selection | ||
evaluated by aic | ||
""" | ||
|
||
null_temp = re.split('~',null_formula) | ||
null_predic_com = null_temp[1].split('+') | ||
null_predic = null_predic_com[1:len(null_predic_com)] | ||
full_temp = re.split('~',full_formula) | ||
full_predic_com = full_temp[1].split('+') | ||
full_predic = full_predic_com[1:len(full_predic_com)] | ||
indices = [i for i,id in enumerate(full_predic) if id not in null_predic] | ||
domain = [full_predic[i] for i in indices] | ||
start = set(null_predic) | ||
remaining = set(domain) | ||
selected = null_predic | ||
current_score, best_new_score = float('inf'), float('inf') | ||
score_selected = [] | ||
variable_added = [] | ||
|
||
while (remaining and current_score == best_new_score and self.step >0): | ||
scores_with_candidates = [] | ||
for candidate in remaining: | ||
formula = "{} ~ {}".format(response,' + '.join(selected + [candidate])) | ||
if self.fit_intercept == 0: | ||
formula = formula + "-1" | ||
score = smf.ols(formula, data).fit().aic | ||
scores_with_candidates.append((score, candidate)) | ||
scores_with_candidates.sort() | ||
best_new_score, best_candidate = scores_with_candidates.pop(0) | ||
if current_score > best_new_score: | ||
remaining.remove(best_candidate) | ||
selected.append(best_candidate) | ||
score_selected.append(best_new_score) | ||
variable_added.append(best_candidate) | ||
current_score = best_new_score | ||
self.step=self.step-1 | ||
formula = "{} ~ {}".format(response,' + '.join(selected)) | ||
if self.fit_intercept == 0: | ||
formula = formula + "-1" | ||
model = smf.ols(formula, data).fit() | ||
return model | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
name = "stepwisereg" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import numpy as np | ||
import warnings | ||
import os | ||
import statsmodels.formula.api as smf | ||
import pandas as pd | ||
import functools | ||
import re | ||
warnings.filterwarnings('ignore') | ||
|
||
|
||
class stepwise: | ||
|
||
def __init__(self,step,fit_intercept): | ||
self.step = step | ||
self.fit_intercept = fit_intercept | ||
|
||
def reduce_concat(self,x, sep=""): | ||
return functools.reduce(lambda x, y: str(x) + sep + str(y), x) | ||
|
||
def fit(self,data,null_formula,full_formula,response): | ||
|
||
"""Linear model designed by forward selection. | ||
Parameters: | ||
----------- | ||
data : pandas DataFrame with all possible predictors and response | ||
response: string, name of response column in data | ||
Returns: | ||
-------- | ||
model: an "optimal" fitted statsmodels linear model | ||
with an intercept | ||
selected by forward selection | ||
evaluated by aic | ||
""" | ||
|
||
null_temp = re.split('~',null_formula) | ||
null_predic_com = null_temp[1].split('+') | ||
null_predic = null_predic_com[1:len(null_predic_com)] | ||
full_temp = re.split('~',full_formula) | ||
full_predic_com = full_temp[1].split('+') | ||
full_predic = full_predic_com[1:len(full_predic_com)] | ||
indices = [i for i,id in enumerate(full_predic) if id not in null_predic] | ||
domain = [full_predic[i] for i in indices] | ||
start = set(null_predic) | ||
remaining = set(domain) | ||
selected = null_predic | ||
current_score, best_new_score = float('inf'), float('inf') | ||
score_selected = [] | ||
variable_added = [] | ||
|
||
while (remaining and current_score == best_new_score and self.step >0): | ||
scores_with_candidates = [] | ||
for candidate in remaining: | ||
formula = "{} ~ {}".format(response,' + '.join(selected + [candidate])) | ||
if self.fit_intercept == 0: | ||
formula = formula + "-1" | ||
score = smf.ols(formula, data).fit().aic | ||
scores_with_candidates.append((score, candidate)) | ||
scores_with_candidates.sort() | ||
best_new_score, best_candidate = scores_with_candidates.pop(0) | ||
if current_score > best_new_score: | ||
remaining.remove(best_candidate) | ||
selected.append(best_candidate) | ||
score_selected.append(best_new_score) | ||
variable_added.append(best_candidate) | ||
current_score = best_new_score | ||
self.step=self.step-1 | ||
formula = "{} ~ {}".format(response,' + '.join(selected)) | ||
if self.fit_intercept == 0: | ||
formula = formula + "-1" | ||
model = smf.ols(formula, data).fit() | ||
return model | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,5 @@ | ||
name = "stepwisereg" | ||
|
||
# -*- coding: utf-8 -*- | ||
|
||
from .stepwisereg import * | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
126 changes: 126 additions & 0 deletions
126
stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import stepwisereg\n", | ||
"import numpy as np" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"data = pd.read_csv(\"lung_cancer.csv\")\n", | ||
"###########Train Dataset and Test Dataset Creation########\n", | ||
"\n", | ||
"msk = np.random.rand(len(data)) < 0.8\n", | ||
"train = data[msk]\n", | ||
"test = data[~msk]\n", | ||
"\n", | ||
"#########Independent Variables are from 2:102 and 106 has Dependent Variable########\n", | ||
"X_train = data.iloc[:,2:102]\n", | ||
"Y_train = 10*data.iloc[:,106]\n", | ||
"\n", | ||
"###########Changes in the name of columns######\n", | ||
"columns = list(X_train.columns.values)\n", | ||
"columns_changes = list(map(lambda x:x.replace(\"-\", \"_\"),columns))\n", | ||
"X_train.columns = columns_changes\n", | ||
"train = pd.concat([X_train,Y_train],axis=1)\n", | ||
"\n", | ||
"X_test = test.iloc[:,2:102]\n", | ||
"Y_test = 10*test.iloc[:,106]\n", | ||
"\n", | ||
"columns_test = list(X_test.columns.values)\n", | ||
"columns_changes_test = map(lambda x:x.replace(\"-\", \"_\"),columns_test)\n", | ||
"X_test.columns = columns_changes_test\n", | ||
"test = pd.concat([X_test,Y_test],axis=1)\n", | ||
"\n", | ||
"##Creating the features concatenation\n", | ||
"features = \"+\".join(columns_changes)\n", | ||
"\n", | ||
"##Creating Null and Full formula\n", | ||
"var1 = columns_changes[0] \n", | ||
"null = 'OS_MONTHS ~' + var1\n", | ||
"full = 'OS_MONTHS ~' + features" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" OLS Regression Results \n", | ||
"==============================================================================\n", | ||
"Dep. Variable: OS_MONTHS R-squared: 0.296\n", | ||
"Model: OLS Adj. R-squared: 0.286\n", | ||
"Method: Least Squares F-statistic: 28.99\n", | ||
"Date: Tue, 24 Sep 2019 Prob (F-statistic): 1.49e-24\n", | ||
"Time: 18:04:02 Log-Likelihood: -1244.2\n", | ||
"No. Observations: 351 AIC: 2500.\n", | ||
"Df Residuals: 345 BIC: 2524.\n", | ||
"Df Model: 5 \n", | ||
"Covariance Type: nonrobust \n", | ||
"==============================================================================\n", | ||
" coef std err t P>|t| [0.025 0.975]\n", | ||
"------------------------------------------------------------------------------\n", | ||
"Intercept 34.8287 4.693 7.421 0.000 25.598 44.060\n", | ||
"FAM159B 10.8594 1.964 5.530 0.000 6.997 14.722\n", | ||
"EXOSC6 10.2559 1.915 5.357 0.000 6.490 14.022\n", | ||
"PRORY 16.5512 2.959 5.594 0.000 10.732 22.371\n", | ||
"ASAP3 5.0476 1.119 4.509 0.000 2.846 7.249\n", | ||
"ACSM5 12.5162 2.816 4.445 0.000 6.978 18.054\n", | ||
"==============================================================================\n", | ||
"Omnibus: 95.928 Durbin-Watson: 2.010\n", | ||
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 253.602\n", | ||
"Skew: 1.291 Prob(JB): 8.53e-56\n", | ||
"Kurtosis: 6.267 Cond. No. 25.4\n", | ||
"==============================================================================\n", | ||
"\n", | ||
"Warnings:\n", | ||
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"model = stepwisereg.stepwise(5,1)\n", | ||
"model_fit = model.fit(train,null,full,'OS_MONTHS')\n", | ||
"model_param = model_fit.params\n", | ||
"test_predict = model_fit.predict(test)\n", | ||
"print(model_fit.summary())" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
44 changes: 44 additions & 0 deletions
44
stepwisereg/test/.ipynb_checkpoints/lusc_lung_cancer-checkpoint.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import pandas as pd | ||
import stepwisereg | ||
import numpy as np | ||
|
||
|
||
|
||
data = pd.read_csv("lung_cancer.csv") | ||
###########Train Dataset and Test Dataset Creation######## | ||
|
||
msk = np.random.rand(len(data)) < 0.8 | ||
train = data[msk] | ||
test = data[~msk] | ||
|
||
#########Independent Variables are from 2:102 and 106 has Dependent Variable######## | ||
X_train = data.iloc[:,2:102] | ||
Y_train = 10*data.iloc[:,106] | ||
|
||
###########Changes in the name of columns###### | ||
columns = list(X_train.columns.values) | ||
columns_changes = list(map(lambda x:x.replace("-", "_"),columns)) | ||
X_train.columns = columns_changes | ||
train = pd.concat([X_train,Y_train],axis=1) | ||
|
||
X_test = test.iloc[:,2:102] | ||
Y_test = 10*test.iloc[:,106] | ||
|
||
columns_test = list(X_test.columns.values) | ||
columns_changes_test = map(lambda x:x.replace("-", "_"),columns_test) | ||
X_test.columns = columns_changes_test | ||
test = pd.concat([X_test,Y_test],axis=1) | ||
|
||
##Creating the features concatenation | ||
features = "+".join(columns_changes) | ||
|
||
##Creating Null and Full formula | ||
var1 = columns_changes[0] | ||
null = 'OS_MONTHS ~' + var1 | ||
full = 'OS_MONTHS ~' + features | ||
|
||
model = stepwisereg.stepwise(5,1) | ||
model_fit = model.fit(train,null,full,'OS_MONTHS') | ||
model_param = model_fit.params | ||
test_predict = model_fit.predict(test) | ||
print(model_fit.summary()) |
File renamed without changes.
Oops, something went wrong.