"""
filename: main_analysis_pandas_version.py
description:
inputs:
outputs:
notes/to-do's:
- TODO 9/23/2013:
    - why are the coefficient sizes so big???
        - probably because some columns are constant?! so when there's no variation, you can multiply even by a huge
          constant and there's little effect?...
    - insert code to remove missing values!!!
        - specifically, the lower/higher ranges
    - speed up code
        - can be done by creating a design matrix with one column (the first) missing (for the DV) in the block of code
          before the "for each DV" loop, if not earlier
        - PROBABLY NEED TO CONVERT THE WHOLE THING TO A PANEL OF DATAFRAMES
    - figure out how to run multinomial logit?!
    - figure out a better way to extract the right p-values and coefficients from the results
        - since the categoricals aren't centered, if the coefficients are of different sizes right now, that may be due
          simply to there being more "years possible"... if categorical coeffs tend to be of different sizes than cardinals...
    - need to run the appropriate models for different variable types
        - for a binary dependent variable, do logistic
        - split categorical IVs into dummies
        - don't need to make any changes to ordinal IVs?
    - coefficient sizes
        - JE wants to center the variables and normalize them, to see coefficient size
Created on Sun Sep 01 15:55:48 2013
@author: Misha
"""
from __future__ import division
import cPickle as cp
import pandas as pd
import savReaderWriter as srw
import numpy as np
import statsmodels.formula.api as smf
import random, sys
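
# A minimal sketch of the centering/normalization mentioned in the docstring TODO
# ("JE wants to center the variables and normalize them, to see coefficient size"):
# standardize a column to z-scores so coefficient sizes are comparable across variables.
# Hypothetical helper -- not yet wired into the model-fitting loop below.
def standardize(series):
    # subtract the mean, then divide by the standard deviation
    return (series - series.mean()) / series.std()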
# global variables
# I am taking out year 1984 for now because I don't have variable data on it! Need to log in to commander.uchicago.edu
# and create a text file from the variable view of that year's GSS...
GSS_YEARS = [1972, 1973, 1974, 1975, 1976, 1977, 1978,
1980, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
1990, 1991, 1993, 1994, 1996, 1998,
2000, 2002, 2004, 2006, 2008, 2010, 2012]
# LOAD FILES ########################################################################
sys.path.append('../Code/')
from articleClass import *
pathToData = '../Data/'
ALL_VARIABLE_NAMES = cp.load(open(pathToData + 'ALL_VARIABLE_NAMES.pickle'))
ALL_VARIABLE_NAMES = [el.upper() for el in ALL_VARIABLE_NAMES] # normalize names to upper case
MISSING_VALUES_DICT = cp.load(open(pathToData + 'MISSING_VALUES_DICT.pickle', 'rb'))
MEASURE_LEVELS = cp.load(open(pathToData + 'MEASURE_LEVELS.pickle'))
articleIDAndGSSYearsUsed = cp.load(open(pathToData + 'articleIDAndGssYearsUsed-cleaned.pickle')) # load the years used
VARS_BY_YEAR = cp.load(open(pathToData + 'VARS_BY_YEAR.pickle'))
# structure of the dictionary above: { year (int) : [ set of variable names (strs), [variable_i, metadata_i] ] }
YEAR_INDICES = cp.load(open(pathToData + 'YEAR_INDICES.pickle'))
VAR_INDICES = cp.load(open(pathToData + 'VAR_INDICES_binary.pickle', 'rb'))
articleClasses = cp.load(open(pathToData + 'articleClasses.pickle', 'rb'))
# Part 1 complete.
# Load GSS data and create result containers
allPropsForYearsUsed = []
allPropsForYearsPossible = []
allParamSizesForYearsUsed = []
allParamSizesForYearsPossible = []
allRsForYearsUsed, allRsForYearsPossible = [], []
GSSFilename = 'GSS Dataset/GSS7212_R2.sav'
data = srw.SavReader(pathToData + GSSFilename)
# index the DataFrame by the first column (presumably the GSS year) so that
# df.loc[year, ...] below selects all respondents from a given year;
# df must be built here -- the main loop references it
df = pd.DataFrame(data.all(), index=data[:,0], columns=ALL_VARIABLE_NAMES)
#with data: # this makes sure the file will be closed, memory cleaned up after the program is run
#data = np.array(data.all()) # this makes sure the entire dataset is loaded into RAM, which makes accessing much faster
# NEED TO PUT A SECTION/FUNCTION HERE THAT DOES ALL THE FILTERING IN ONE PLACE, SO THAT BY THE TIME
# THE CODE GETS TO articleClasses, it only works on a subset it can actually process
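
# A sketch of the consolidated filter the note above asks for -- hypothetical criteria,
# left commented out so behavior is unchanged: keep only articles that have central IVs
# and at least one GSS year to run models on.
#articleClasses = [a for a in articleClasses
#                  if a.centralIVs and (a.GSSYearsUsed or a.GSSYearsPossible)]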
for article in random.sample(articleClasses, 100):
#for article in articleClasses:
#for article in [articleClasses[-2]]:
    print 'Processing article:', article.articleID
    if len(article.centralIVs) < 1:
        print 'No "central" IVs. Skipping'
        continue

    # counts of significant coefficients
    coeffsTotalForYearsUsed = 0
    coeffsSigForYearsUsed = []
    coeffsSigForYearsPossible = []
    coeffsTotalForYearsPossible = 0
    # proportions of significant coeffs
    propSigForYearsUsed = 0
    propSigForYearsPossible = 0
    # parameter sizes
    paramSizesForYearsUsed = []
    paramSizesForYearsPossible = []
    avgParamSizeForYearsUsed = 0.0
    avgParamSizeForYearsPossible = 0.0
    RsForYearsUsed, RsForYearsPossible = [], []
    for year in (article.GSSYearsUsed + article.GSSYearsPossible): # for each GSS year the article used or could've used
        for DV in article.DVs: # for each model
            # construct the design matrix: the DV column first, then the IVs and controls
            design = df.loc[year, [DV]+article.IVs+article.controls].copy(deep=True) # need a deep copy so that the original df isn't changed
            # remove columns that are constant; if the DV is constant, skip to the next model
            if len(design[DV].unique()) == 1: continue # DV is constant
            for col in design.columns:
                if len(design[col].unique()) == 1: # if any IVs or controls are constant, drop 'em
                    design = design.drop(col, axis=1) # drop() returns a new frame; without reassignment the column was never actually dropped
            # remove missing values
            for col in design.columns:
                mv = MISSING_VALUES_DICT[col]
                if 'values' in mv:
                    design[col].replace(mv['values'], [np.nan]*len(mv['values']), inplace=True) # it's important to have inplace=True
                # !!! need to insert the other case here, where the missing values are in a RANGE with 'higher' and 'lower' bounds
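                # A sketch of the range case flagged above, assuming the bounds are stored
                # under 'lower' and 'higher' keys (the key names are an assumption based on
                # the comment above; adjust them if the pickle uses a different convention):
                elif 'lower' in mv and 'higher' in mv:
                    inRange = (design[col] >= mv['lower']) & (design[col] <= mv['higher'])
                    design.loc[inRange, col] = np.nan # flag every value inside the missing range as NaN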
            design = design.dropna(axis=0) # drop all rows with any missing values (np.nan)
            # skip if there's not enough data left after deleting rows
            if design.shape[0] < design.shape[1]: # fewer rows than columns
                print 'Not enough IV/control data. Skipping...'
                continue
            # create the patsy formula: center() ratio variables, wrap everything else in C()
            terms = []
            for col in design.columns[1:]: # don't include the DV on the RHS!
                if MEASURE_LEVELS[col] == 'ratio': terms.append('center(' + col + ')') # only center() if it's a ratio?
                else: terms.append('C(' + col + ')') # shouldn't center() a categorical
            formula = DV + ' ~ ' + ' + '.join(terms)
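            # e.g., a hypothetical model with DV HAPPY, ratio IV AGE, and categorical
            # controls SEX and RACE would yield: 'HAPPY ~ center(AGE) + C(SEX) + C(RACE)'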
            # can select which model to fit HERE!
            if MEASURE_LEVELS[DV] == 'ratio': results = smf.ols(formula, data=design).fit()
            else: results = smf.ols(formula, data=design).fit() # TODO: both branches currently fit OLS; see the notes in the docstring
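            # A sketch of the logistic branch the TODOs call for, kept commented out until
            # the binary-DV case is worked out; the two-level check (and whether the DV is
            # coded 0/1 as smf.logit requires) are assumptions, not confirmed against the data:
            #if MEASURE_LEVELS[DV] != 'ratio' and len(design[DV].unique()) == 2:
            #    results = smf.logit(formula, data=design).fit()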
            if year in article.GSSYearsUsed:
                RsForYearsUsed.append( results.rsquared )
                # only count coefficients belonging to one of the article's central IVs;
                # the intercept never matches a central IV, so it is never counted
                for col in results.pvalues.index:
                    for iv in article.centralIVs:
                        if iv in col:
                            coeffsTotalForYearsUsed += 1
                            paramSizesForYearsUsed.append(abs(results.params[col]))
                            if results.pvalues[col] < 0.05:
                                coeffsSigForYearsUsed.append(results.pvalues[col])
            elif year in article.GSSYearsPossible: # the GSS year the models were run on is a "new" year (wasn't used in the article)
                RsForYearsPossible.append( results.rsquared )
                for col in results.pvalues.index:
                    for iv in article.centralIVs:
                        if iv in col:
                            coeffsTotalForYearsPossible += 1
                            paramSizesForYearsPossible.append(abs(results.params[col]))
                            if results.pvalues[col] < 0.05:
                                coeffsSigForYearsPossible.append(results.pvalues[col])
                            break # stop after the first central IV that matches this column
            '''
            # only count CENTRAL IVs
            coeffsSigForYearsPossible.extend([el for el in results.pvalues[indicesOfCentralIVs] if el < 0.05])
            coeffsTotalForYearsPossible += len(article.centralIVs)
            paramSizesForYearsPossible.extend(results.params[indicesOfCentralIVs])
            '''
    if coeffsTotalForYearsUsed != 0:
        allRsForYearsUsed.append( np.mean(RsForYearsUsed) )
        propSigForYearsUsed = float(len(coeffsSigForYearsUsed)) / coeffsTotalForYearsUsed
        allPropsForYearsUsed.append(propSigForYearsUsed)
        allParamSizesForYearsUsed.append( np.mean(paramSizesForYearsUsed) )
    if coeffsTotalForYearsPossible != 0:
        allRsForYearsPossible.append( np.mean(RsForYearsPossible) )
        propSigForYearsPossible = float(len(coeffsSigForYearsPossible)) / coeffsTotalForYearsPossible
        allPropsForYearsPossible.append(propSigForYearsPossible)
        allParamSizesForYearsPossible.append( np.mean(paramSizesForYearsPossible) )

# should I put a delete command for data here?
'''
cp.dump(allPropsForYearsPossible, open(pathToData + 'allPropsForYearsPossible.pickle', 'wb'))
cp.dump(allPropsForYearsUsed, open(pathToData + 'allPropsForYearsUsed.pickle', 'wb'))
'''