# hw5.py

import pathlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from typing import Union, Tuple

class QuestionnaireAnalysis:
    """
    Reads and analyzes data generated by the questionnaire experiment.

    The file name may be given as a string or as a ``pathlib.Path``.
    Every analysis method re-reads the file through ``read_data`` so that it
    always starts from the original, unmodified table.
    """

    # Columns holding the grades of the five questions.
    _QUESTION_COLS = ["q1", "q2", "q3", "q4", "q5"]

    def __init__(self, data_fname: Union[pathlib.Path, str]):
        """Stores the path of the data file.

        Parameters
        ----------
        data_fname : pathlib.Path or str
            Location of the JSON file with the questionnaire data.

        Raises
        ------
        ValueError
            If the given path does not exist.
        """
        self.data_fname = pathlib.Path(data_fname)
        self.data = None
        if not self.data_fname.exists():
            raise ValueError("Error! the file does not exist")

    def read_data(self) -> pd.DataFrame:
        """Reads the json data located in self.data_fname into memory, to
        the attribute self.data.

        Returns
        -------
        pd.DataFrame
            The freshly loaded table (the same object as self.data).
        """
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(self.data_fname, "r", encoding="utf-8") as file:
            self.data = pd.read_json(file)
        return self.data

    def show_age_distrib(self) -> Tuple[np.ndarray, np.ndarray]:
        """Calculates and plots the age distribution of the participants.

        Participants without a known age are ignored. The returned counts
        are exactly the ones that are plotted.

        Returns
        -------
        hist : np.ndarray
            Number of people in a given bin
        bins : np.ndarray
            Bin edges
        """
        ages = self.read_data()["age"].dropna()
        # Ten-year wide bins covering the ages 0-100.
        bins = np.arange(0, 110, 10)
        # plt.hist returns the (float) counts it draws, so the returned
        # histogram can never disagree with the figure.
        hist, _, _ = plt.hist(ages, bins=bins)
        plt.show()
        return (np.asarray(hist, dtype=float), bins)

    @staticmethod
    def _is_valid_email(email) -> bool:
        """Tells whether a single value looks like a valid email address.

        An address is accepted when it is a string with exactly one "@",
        a non-empty part before it, and a domain after it that contains a
        "." which is neither the first nor the last character of the domain.
        """
        if not isinstance(email, str):
            # Missing values (None / NaN) are never valid addresses.
            return False
        local, at_sign, domain = email.partition("@")
        if not (local and at_sign and domain) or "@" in domain:
            return False
        return (
            "." in domain
            and not domain.startswith(".")
            and not domain.endswith(".")
        )

    def remove_rows_without_mail(self) -> pd.DataFrame:
        """Checks self.data for rows with invalid emails, and removes them.

        Returns
        -------
        df : pd.DataFrame
            A corrected DataFrame, i.e. the same table but with the erroneous
            rows removed and the (ordinal) index after a reset.
        """
        df = self.read_data()
        valid = df["email"].map(self._is_valid_email).astype(bool)
        # drop=True keeps the table's columns unchanged instead of adding
        # the old index as a new "index" column.
        return df[valid].reset_index(drop=True)

    def fill_na_with_mean(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Finds, in the original DataFrame, the subjects that didn't answer
        all questions, and replaces that missing value with the mean of the
        other grades for that student.

        Returns
        -------
        df : pd.DataFrame
            The corrected DataFrame after insertion of the mean grade
            (the same object as self.data).
        arr : np.ndarray
            Sorted row indices of the students that their new grades were
            generated
        """
        df = self.read_data()
        grades = df[self._QUESTION_COLS]

        has_missing = grades.isna().any(axis=1).to_numpy()
        arr = np.sort(df.index[has_missing].to_numpy())

        # The per-student mean skips NaNs, so it is the mean of the answered
        # questions only. It is computed once, before anything is filled.
        row_means = grades.mean(axis=1)
        # Assigning the filled columns back avoids the chained in-place
        # fillna, which does not modify the table under copy-on-write.
        df[self._QUESTION_COLS] = grades.apply(
            lambda col: col.fillna(row_means)
        )
        return (df, arr)

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """
        df = self.read_data()
        grades = df[self._QUESTION_COLS]

        too_many_nans = grades.isna().sum(axis=1) > maximal_nans_per_sub
        # A subject with no grades at all has a NaN mean, which simply stays
        # NA after the cast instead of raising.
        floored = np.floor(grades.mean(axis=1))
        df["score"] = floored.mask(too_many_nans).astype("UInt8")
        return df

    def correlate_gender_age(self) -> pd.DataFrame:
        """Looks for a correlation between the gender of the subject, their age
        and the score for all five questions.

        Subjects without a known age are left out. A subject aged exactly 40
        is not considered to be above 40.

        Returns
        -------
        pd.DataFrame
            A DataFrame with a MultiIndex containing the gender and whether the subject is above
            40 years of age, and the average score in each of the five questions.
        """
        df = self.read_data()
        df = df[df["age"].notna()]
        # assign() returns a new table, so the filtered slice is never
        # written to. "age" becomes a flag: True when strictly above 40.
        df = df.assign(age=df["age"].gt(40))
        # Only the question columns are averaged; text columns (names,
        # emails, ...) cannot be averaged.
        return df.groupby(["gender", "age"])[self._QUESTION_COLS].mean()

#ques = QuestionnaireAnalysis(pathlib.Path(r"C:\Users\mandy\source\repos\hw5\data.json"))
#ques.show_age_distrib()
#ques.remove_rows_without_mail()
#ques.fill_na_with_mean()
#ques.score_subjects()
#ques.correlate_gender_age()