You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
pd.read_csv('filename.csv')
pd.read_csv('filename.csv', index_col='Column')
var.to_csv('new_name.csv', index=False) # Create a new file from a variable
Methods
.head()
.tail()
.info()
.describe() # statistical summary
.min() # min data per column
.max() # max data per column
.count() # how many values we have, does not include missing values
.mean()
.std() # Standard deviation
.ptp() # Difference between max and min values (removed from pandas objects in 1.0; use np.ptp(values) or .max() - .min())
.median() # Median
.corr() # correlation of numerical columns
.unique() # Returns the unique values of a column (Series)
.nunique() # How many unique values per column
.value_counts() # How many times each distinct value occurs
.copy() # Creates a copy from the main variable
.sort_values('column') # Return sorted values according to column selected
.sort_values(['column1','column2','column3'])
.nlargest(n=5, columns='column_name') # Returns largest sorted values per column selected
.nsmallest(n=5, columns='column_name') # Returns smallest sorted values per column selected
.isna() # This checks if every element is a missing value or not
.isna().sum() # sum missing values
.notna().sum() # sums actual values
.dropna(axis=1, thresh=500, inplace=True) # Drops columns (axis=1) or rows (axis=0) with missing values; thresh=500 keeps only those with at least 500 non-NA values
var_mean = var.column.mean()
.column.fillna(value=var_mean, inplace=True) # Replace NA values
var.column1.corr(var.column2) # specific correlation between two columns
.agg(['mean','std','min','max','median'])
# Create a customized set of summary statistics with only one line of code
.astype('int')
Methods values
.method(skipna=True) # Skips NA (missing) values
.nunique(dropna=True) # Not counting NA values
.method(sort=True) # Sort values
.method(ascending=True) # Sorting type
.method(bins=5) # Returns statistical ranges
.method(inplace=True) # Save the change
Built-in functions
type(dataframe) # Shows the type
round(dataframe, 0) # Round every number in the dataframe to a number of decimals
len(dataframe) # Shows how many rows we've got
Attributes
.columns
.index
.shape # Tuple of (number of rows, number of columns)
.size
.axes # Returns both the row labels and the column labels
Functions
type(var)
len(var) # counts how many values we have
round(var) # round numbers
Indexing and Slicing
var.reset_index(inplace=True)
var.set_index('Column', inplace=True) # Change index column
var.index = var_index # var_index is a previously created list
var['column'] # returns only data from that column
var[['column']] # same data, but returned as a DataFrame (better display)
var[['column1','column2']] # Select several columns by passing a list
var.column # returns data from that column, different method
var.iloc[0] # The best way to select rows
var.iloc[200:250] # Select a range of rows
var.iloc[[2,5,200]] # Select a list of rows
var.iloc[row, column] # Select a row and a specific column
var.loc['DRIVAS, Dimitrios'] # Selecting specific data
var.loc['LASTNAME, FNAME'].iloc[0]
var.loc['LASTNAME, FNAME', ['Column1','Column2']]
var.iloc[2] = 29 # changing one item's value
var.columns[columnindexnumber]
var.columns.tolist() # Put the values into a list
var.index.get_loc('LNAME, FNAME') # Gets index location
var.columns = ['New Name1','New Name2',...,'New Name(n)'] # Changing column names
var.index.name = 'New Index Name' # Changing index name
var.rename(mapper={'Index Name':'New Name'}, axis='index') # Changing a value's name
var.rename(index={'Index Name':'New Name'}, inplace=True) # Different method; use index= or columns=
var[var.column == 'value_we_want'] # e.g. male or female; filtering by a value
var.loc[var.column == 'value_we_want'] # Better way to do it
var.loc[var.column.between(low, high, inclusive='both')] # inclusive='both' replaces the old inclusive=True
var.loc[var.column.isin([num1, num2])] # isin takes a single list-like of values
var.loc[var.column.isin(df.column)] # Slice a df with another dataframe
var.duplicated(subset="column") # Find duplicates
df[(var.ticker=='AXP') & (var.per_cal_qtr==1) & (var['per_cal_year'] ==2021)] # Slicing by more than one column
~ # This symbol means the opposite (not)
& | # and & or
(var1 & var2) # Previously created var1 = var.column == 'val'
var.loc[var1 & var2] # filtering data with several variables
(var1 | var2) # means or
var.loc[var1 | var2]