pandas_cheatsheet


source https://www.dataquest.io/blog/pandas-python-tutorial/
1) read csv
pd.read_csv("../../")

2) find the shape of the data frame: df.shape ---> returns a (rows, columns) tuple
 
3) querying using iloc and loc ---> print(df.loc[df['score_phrase'] == "Amazing"][["release_month","genre"]])
4) print(df.loc[df["Staff Group"]=="CHSI",["Epl_days","nfl_days"]])
   print(df[df["Staff Group"]=="CHSI"][["Epl_days","nfl_days"]])
   
  5)Assigning a list to pandas data frame 
  df["Name of the column"]=list


7)Remove duplicates -->initial_data.drop_duplicates(subset=['caseId'], keep="first",inplace=True)

8) drop rows whose caseId is the string 'nan' (use dropna() for real NaN values) --> initial_data = initial_data[initial_data['caseId']!='nan'] 
9) data['Staff Group1']=data['Staff Group'].apply(lambda x:{'Billing':1, 'CHSI':2, 'New Products':3, 'Order Entry':4, 'Repair':5, 'Sales':6}.get(x,x))
10) labels = {'a': 1,'b': 0,'c':2}
    data["NPS"] = [labels[item] for item in data["NPS"]]
    data['diagnosis'] = data['diagnosis'].map({'M':1, 'B':0});
    
11) plot points for smote checking
pd.value_counts(data['Class']).plot.bar()
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
data['Class'].value_counts()


12)--------------standard scaler
from sklearn.preprocessing import StandardScaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

13-----------------------------query with np.where 

np.where(df[condition],x,y)

  the condition evaluates to True or False for every value in the series;
  np.where returns x where it is True and y where it is False (x and y can be scalars or series) - assign the result to a data frame column to store it
  
  
14------------------------------merge dataframes based on a column 
                                for e.g. 
                              ID      value
           dfnew               d1      5
                               d2      6

         original df 
         ID col1 col2 col3 
         d1   5   6    7 
         d2   7   8    8 
         d1   1   5    7
      
      df.merge(dfnew, on="ID", how="left")
15 #----------------------------------------------- get the lowest value of a column per group, rename the column and store the result in a new df----------------
      latestdf=pd.DataFrame(df.groupby(by=["ID"])["col1"].apply(np.min).reset_index().rename(columns={"col1":"new_col1"}))
      
      
16 #-------------- Aggregate ‘sum’ and ‘min’ function across all the columns in data frame.------------------------------------

               df.aggregate(['sum', 'min'])
               we can also use groupby with this method if we need grouped output 
    finding the max value of col            pd.DataFrame(df.groupby(by=["ID"]).agg({"col2":max}).reset_index().rename(columns={"col2":"new_col2_2"}))
    
 17-------------------renaming column headers--------------------------
               df.rename(columns={old:new},inplace=True)
 
 
 18----------------Multiple groupby and aggregate based on a dictionary (agg_val maps column name -> aggregation, e.g. {"col1": "sum", "col3": "min"})
 
 multiple_group=df.groupby(["ID","col2"]).agg(agg_val).reset_index()

# coding=utf8

import pandas as pd
from Code.config import NPS_values
import numpy as np
from Code import config
from datetime import datetime

class ChatInteractions(object):
    """Build per-account interaction summary rows from chat records.

    The column lists that drive the processing come from the project
    ``config`` module (``cols``, ``train_cols``, ``weighted_cols``,
    ``mean_cols``) and from ``NPS_values``; their contents are not visible
    in this file.
    """

    # def load_data(self):
    #     # initial_data = pd.read_csv("Training 5k Chat 1.0.csv",
    #     # encoding="iso-8859-1")
    #     initial_data = pd.read_csv("Testing 5k Chat 1.0.csv",encoding="iso-8859-1")
    #     initial_data = initial_data[initial_data['accountNumber'] != 'nan']
    #     # data = preprocess_data(initial_data[cols])
    #     # initial_data=initial_data[initial_data['accountNumber']=='"8155100020416180"']
    #     # initial_data=initial_data[initial_data['accountNumber']=='"8155100525501148"']
    #     self.get_interactions(initial_data)

    def get_interactions(self, data):
        """Return the summary rows of every account found in ``data``.

        Rows are grouped by ``accountNumber``; each group is ordered by
        ``chatStartDateTime`` (newest first) and handed to ``get_row``.

        Args:
            data: DataFrame holding at least the ``chatStartDateTime``,
                ``accountNumber`` and ``NPS`` columns plus whatever
                ``get_row`` reads. Rows with a missing ``chatStartDateTime``
                are dropped from this frame IN PLACE.

        Returns:
            DataFrame with the rows produced by ``get_row`` for all
            accounts, an extra boolean ``same_row`` column, and rows whose
            ``accountNumber`` equals the literal string ``'"nan"'`` removed.
        """
        # Kept in place on purpose: callers may rely on `data` being cleaned.
        data.dropna(subset=['chatStartDateTime'], inplace=True)
        gp_data = data.groupby("accountNumber")
        training_data = pd.DataFrame(columns=config.train_cols)
        account_rows = []
        for name in gp_data.groups:
            # Private copy: the edits below must not write into (or warn
            # about writing into) a slice of `data`.
            group = gp_data.get_group(name).copy()
            try:
                group['chatStartDateTime'] = group['chatStartDateTime'].apply(
                    lambda x: datetime.strptime(str(x), "%m/%d/%Y %H:%M"))
            except (ValueError, TypeError):
                # Best effort: values that do not match the expected
                # "%m/%d/%Y %H:%M" layout are left exactly as they are.
                pass
            group = group.sort_values(by=['chatStartDateTime'],
                                      ascending=False)
            n = group['NPS'].dropna()
            data_row = self.get_row(group)
            # Flags whether the number of summary rows equals the number of
            # non-null NPS entries of the account.
            data_row['same_row'] = len(n) == len(data_row)
            if len(n) != len(data_row):
                print(group['accountNumber'])
            account_rows.append(data_row)
        if account_rows:
            # Concatenate once instead of growing the frame per account.
            training_data = pd.concat([training_data] + account_rows,
                                      ignore_index=True, sort=True)
        training_data = training_data[training_data['accountNumber'] != '"nan"']
        # training_data.to_csv("Training 5k interactions 1.0.csv",index=False)
        # training_data.to_csv("Testing 5k interactions 1.0.csv",index=False)
        return training_data

    def get_row(self, df):
        """Build one summary row per chat whose NPS is in ``NPS_values``.

        Each pass of the loop looks at the rows that are not newer than the
        latest chat carrying an accepted NPS value, summarises them with
        ``get_weighted_data``, labels the summary with the ``NPS``,
        ``contentTextInternal``, ``caseId`` and ``accountNumber`` of the
        first remaining row, then drops that first row and repeats.

        Args:
            df: chat rows of one account; ``get_interactions`` passes them
                sorted by ``chatStartDateTime``, newest first.

        Returns:
            DataFrame with one row per pass of the loop; an empty frame with
            ``config.train_cols`` columns when no row has an accepted NPS.
        """
        data_row = pd.DataFrame(columns=config.train_cols)
        # Detached copy of the relevant columns with a fresh 0..n-1 index.
        new_df = pd.DataFrame(data=df[config.cols].values[:, :],
                              columns=config.cols)
        summaries = []
        while new_df['NPS'].isin(NPS_values).any():
            scored = new_df['NPS'].isin(NPS_values)
            last_date = max(list(new_df[scored]['chatStartDateTime']))
            # Explicit copy so the fillna assignments below act on an
            # independent frame rather than on a filtered slice.
            new_df = new_df[~(new_df['chatStartDateTime'] > last_date)].copy()
            for col in ('DeadAirInstances', 'DeadAirTotalMinutes'):
                new_df[col] = new_df[col].fillna(new_df[col].mean())
            # my_frame=new_df[cols].mean()
            my_frame = self.get_weighted_data(new_df)
            for col in ('NPS', 'contentTextInternal', 'caseId',
                        'accountNumber'):
                my_frame[col] = new_df[col].values[0]
            # Drop the row just summarised and reload the remaining chats
            # from `df`, which discards the NaN filling done above.
            # NOTE(review): relies on caseId identifying a single row of
            # `df`; a caseId shared with the dropped row would bring that
            # row back - confirm against the data.
            caseIDs = list(new_df[1:]['caseId'])
            new_df = df[df['caseId'].isin(caseIDs)][config.cols]
            summaries.append(pd.DataFrame(my_frame).transpose())
        if summaries:
            # Concatenate once instead of growing the frame per summary.
            data_row = pd.concat([data_row] + summaries,
                                 ignore_index=True, sort=True)
        return data_row

    def get_weighted_data(self, data):
        """Return the column means of ``data`` after positional weighting.

        Row ``i`` of the ``config.weighted_cols`` columns is multiplied by
        ``(1 - 1/len(data)) ** i`` (so the first row keeps weight 1), then
        the mean of the ``config.mean_cols[1:]`` columns is taken.

        Args:
            data: DataFrame containing the configured columns. It is not
                modified; the weighting is applied to a copy.

        Returns:
            Series with the mean of every ``config.mean_cols[1:]`` column.

        Raises:
            ZeroDivisionError: if ``data`` has no rows.
        """
        depreciation_rate = 1 / len(data)
        weights = [1]
        for _ in range(len(data) - 1):
            weights.append(weights[-1] * (1 - depreciation_rate))
        weight_arr = np.asarray(weights)
        # Weight a copy so the caller's frame keeps its original values.
        weighted = data.copy()
        weighted[config.weighted_cols] = data[config.weighted_cols].apply(
            lambda col: np.asarray(col) * weight_arr, axis=0)
        return weighted[config.mean_cols[1:]].mean()


pandas groupby with sum
df.groupby([col1,col2]).agg({col3:sum}).reset_index()
df.groupby([col1,col2]).agg({col3:[sum,len]}).droplevel(0,axis=1).rename(columns={"sum":"sumcol","len":"supplier_count"}).astype(int).reset_index()
df.groupby([col1]).agg({col3:np.mean}).reset_index()