forked from GalKepler/hw5
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw5.py
197 lines (156 loc) · 6.66 KB
/
hw5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from typing import Union, Tuple
class QuestionnaireAnalysis:
    """
    Reads and analyzes data generated by the questionnaire experiment.

    The data file location may be given as a string or a pathlib.Path.
    Every analysis method re-reads the file, so each one starts from the
    data as stored on disk rather than from the result of a previous call.
    """

    # Names of the columns that hold the answers to the five questions.
    QUESTION_COLS = ["q1", "q2", "q3", "q4", "q5"]

    def __init__(self, data_fname: Union[pathlib.Path, str]):
        """Store the path of the data file.

        Parameters
        ----------
        data_fname : pathlib.Path or str
            Location of the JSON file with the questionnaire data.

        Raises
        ------
        ValueError
            If the file does not exist.
        """
        self.data_fname = pathlib.Path(data_fname)
        self.data = None
        if not self.data_fname.exists():
            raise ValueError("Error! the file does not exist")

    def read_data(self) -> pd.DataFrame:
        """Reads the json data located in self.data_fname into memory, to
        the attribute self.data.

        Returns
        -------
        pd.DataFrame
            The freshly loaded table (the same object as self.data).
        """
        self.data = pd.read_json(self.data_fname)
        return self.data

    def show_age_distrib(self) -> Tuple[np.ndarray, np.ndarray]:
        """Calculates and plots the age distribution of the participants.

        Returns
        -------
        hist : np.ndarray
            Number of people in a given bin
        bins : np.ndarray
            Bin edges
        """
        df = self.read_data()
        bins = np.arange(0, 110, 10)
        # Subjects without an age cannot be binned, so they are left out.
        ages = df["age"].dropna()
        # Bins are closed on the left and open on the right: [0, 10), ...
        binned = pd.cut(ages, bins, right=False)
        # observed=False keeps empty bins, so hist always has one entry per
        # bin regardless of the pandas default for categorical groupers.
        hist = (
            ages.groupby(binned, observed=False)
            .count()
            .astype(float)
            .to_numpy()
        )
        plt.hist(ages, bins)
        plt.show()
        return hist, bins

    @staticmethod
    def _is_valid_email(email) -> bool:
        """Return True if *email* is a structurally plausible address.

        An address is accepted when it is a string that contains exactly
        one "@" which is neither its first nor its last character, contains
        a "." which is neither its first nor its last character, and in
        which the "@" is not immediately followed by a ".".
        """
        if not isinstance(email, str):
            # Missing values (None / NaN) are never valid addresses.
            return False
        if email.count("@") != 1:
            return False
        if email.startswith("@") or email.endswith("@"):
            return False
        if "." not in email:
            return False
        if email.startswith(".") or email.endswith("."):
            return False
        return "@." not in email

    def remove_rows_without_mail(self) -> pd.DataFrame:
        """Checks self.data for rows with invalid emails, and removes them.

        Returns
        -------
        df : pd.DataFrame
            A corrected DataFrame, i.e. the same table but with the erroneous
            rows removed and the (ordinal) index after a reset.
        """
        df = self.read_data()
        is_valid = df["email"].map(self._is_valid_email).astype(bool)
        # drop=True discards the old index instead of adding it as a column,
        # so the result keeps exactly the columns of the original table.
        return df[is_valid].reset_index(drop=True)

    def fill_na_with_mean(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Finds, in the original DataFrame, the subjects that didn't answer
        all questions, and replaces that missing value with the mean of the
        other grades for that student.

        Returns
        -------
        df : pd.DataFrame
            The corrected DataFrame after insertion of the mean grade
        arr : np.ndarray
            Row indices of the students that their new grades were generated
        """
        df = self.read_data()
        cols = self.QUESTION_COLS
        grades = df[cols]
        # Rows in which at least one of the five answers is missing.
        has_missing = grades.isna().any(axis=1)
        arr = np.sort(df.index[has_missing].to_numpy())
        # Per-subject mean of the answers that are present (NaN is skipped).
        row_means = grades.mean(axis=1)
        for col in cols:
            # Assign the filled column back; an in-place fillna on a column
            # selection is not guaranteed to reach the parent DataFrame.
            df[col] = df[col].fillna(row_means)
        return df, arr

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score"
        column with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades,
        the score should be NA. Otherwise, the score is simply the mean of the
        other grades. The datatype of score is UInt8, and the floating point
        raw numbers should be rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """
        df = self.read_data()
        grades = df[self.QUESTION_COLS]
        too_many_missing = grades.isna().sum(axis=1) > maximal_nans_per_sub
        # Mean over the answered questions, rounded down. A subject with no
        # answers at all yields NaN here, which becomes NA below.
        score = np.floor(grades.mean(axis=1))
        score = score.mask(too_many_missing)
        # The nullable UInt8 dtype represents the missing scores as <NA>.
        df["score"] = score.astype("UInt8")
        return df

    def correlate_gender_age(self) -> pd.DataFrame:
        """Looks for a correlation between the gender of the subject, their
        age and the score for all five questions.

        Returns
        -------
        pd.DataFrame
            A DataFrame with a MultiIndex containing the gender and whether
            the subject is at least 40 years of age, and the average score in
            each of the five questions.
        """
        df = self.read_data()
        # Subjects without an age cannot be assigned to an age group.
        # The copy makes the column assignment below act on an independent
        # table rather than on a slice of the loaded data.
        df = df[df["age"].notna()].copy()
        # NOTE(review): a subject aged exactly 40 lands in the True group
        # (>= 40); confirm whether a strict "above 40" was intended.
        df["age"] = df["age"].ge(40)
        # Three-level index: ordinal row number, gender, age group.
        df = df.set_index(["gender", "age"], append=True)
        # Average only the question columns; the remaining columns (names,
        # e-mail, id, ...) are not meaningful to average.
        return df[self.QUESTION_COLS].groupby(level=["gender", "age"]).mean()
#ques = QuestionnaireAnalysis(pathlib.Path(r"C:\Users\mandy\source\repos\hw5\data.json"))
#ques.show_age_distrib()
#ques.remove_rows_without_mail()
#ques.fill_na_with_mean()
#ques.score_subjects()
#ques.correlate_gender_age()