python codes
########## Removing infinite values: replace with NaN, then fill with 0 ###########
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0)
####################### Creating a new column based on an if-elif-else condition
# If column A equals column B, set column C to 0; if A > B, set it to 1;
# if A < B, set it to -1.
df.loc[(df['A'] == df['B']), 'C'] = 0
df.loc[(df['A'] > df['B']), 'C'] = 1
df.loc[(df['A'] < df['B']), 'C'] = -1
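# For numeric columns, the same 0/1/-1 flag can be built in one vectorized
# line with np.sign (a sketch; assumes numpy is imported as np):
df['C'] = np.sign(df['A'] - df['B'])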
################## Creating dummies and deleting the last variable to keep n-1 ###########
main_areas_dummies = pd.get_dummies(df4['main_areas'], prefix='Area').iloc[:, :-1]
# (pd.get_dummies(..., drop_first=True) also keeps n-1 dummies, dropping the first instead)
######### Creating a variable using multiple if-else #########
# Note: pd.np is deprecated; use numpy directly (import numpy as np)
df['City'] = np.where(df.Location.str.contains("Pune"), "Pune",
             np.where(df.Location.str.contains("Bengaluru"), "Bangalore",
             np.where(df.Location.str.contains("Delhi"), "Delhi",
             np.where(df.Location.str.contains("Kolkata"), "Kolkata",
             np.where(df.Location.str.contains("Mumbai"), "Mumbai",
             np.where(df.Location.str.contains("Hyderabad"), "Hyderabad",
             np.where(df.Location.str.contains("Gurgaon"), "Gurgaon",
             np.where(df.Location.str.contains("Marathalli"), "Bangalore",
             np.where(df.Location.str.contains("Noida"), "Noida",
             np.where(df.Location.str.contains("Whitefield"), "Bangalore",
             np.where(df.Location.str.contains("Electronic City"), "Bangalore",
             np.where(df.Location.str.contains("BTM"), "Bangalore", 'None'))))))))))))
######### Quick transposed preview of the first five rows #########
dataset.head(5).transpose()
#### Plotting categorical columns with unique values
# Assumes: import pandas as pd; import plotly.graph_objs as go;
# from plotly import offline. NUMERICS, CODE_FONT, TEXT_FONT, COLOR and
# SECONDARY_COLOR are user-defined constants, e.g.
# NUMERICS = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Define a dataframe that will hold the plot values
df_cat = pd.DataFrame(columns=['attribute', 'num_unique_values', 'unique_values'])
df_cat['num_unique_values'] = df_cat['num_unique_values'].astype(int)
# Populate the dataframe with the non-numeric attributes and
# the number of unique values
row = 0
df_nonnumeric = dataset.select_dtypes(exclude=NUMERICS)
# Get the list of possible categorical columns
categorical_cols = df_nonnumeric.columns
for col in categorical_cols:
    df_cat.loc[row] = [col, len(dataset[col].unique()), dataset[col].unique()]
    row = row + 1
layout = go.Layout(
    autosize=False,
    height=2000,
    font=dict(family=CODE_FONT),
    xaxis=dict(
        title='Number of unique values',
        titlefont=dict(family=TEXT_FONT, size=16, color=COLOR),
    ),
    yaxis=dict(
        title='Attribute',
        titlefont=dict(family=TEXT_FONT, size=16, color=COLOR),
    ),
    annotations=[
        dict(x=xi, y=yi,
             text=str(xi),
             xanchor='left',
             yanchor='center',
             showarrow=False,
        ) for xi, yi in zip(df_cat['num_unique_values'], df_cat['attribute'])
    ]
)
data = [
    go.Bar(
        y=df_cat['attribute'],
        x=df_cat['num_unique_values'],
        orientation='h',
        marker=dict(
            color=COLOR,
            line=dict(color=SECONDARY_COLOR, width=1)
        )
    )
]
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, show_link=False)
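# A lighter-weight alternative (sketch) using pandas' built-in matplotlib
# plotting instead of plotly; assumes df_cat was built as above.
import matplotlib.pyplot as plt
ax = df_cat.set_index('attribute')['num_unique_values'].plot.barh()
ax.set_xlabel('Number of unique values')
ax.set_ylabel('Attribute')
plt.show()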
#################### Add columns to a dataset based on various conditions
# Add a "false positive comment" column to the test dataset
def FP_com(row):
    if row['HitANDRun'] == 1:
        return 'HitANDRun'
    elif row['MedPaySubroElig'] == 0 and row['PIPSubroElig'] == 0 and row['InsuredBILoss'] == 0 and \
            row['InsuredVehicleLoss'] == 0 and row['InsuredPIPLoss'] == 1 and row['InsuredMedPayLoss'] == 1:
        return 'MPandPIPIneligible'
    elif row['MedPaySubroElig'] == 0 and row['InsuredBILoss'] == 0 and row['InsuredVehicleLoss'] == 0 and \
            row['InsuredPIPLoss'] == 0 and row['InsuredMedPayLoss'] == 1:
        return 'MedPayIneligible'
    elif row['PIPSubroElig'] == 0 and row['InsuredBILoss'] == 0 and row['InsuredVehicleLoss'] == 0 and \
            row['InsuredPIPLoss'] == 1 and row['InsuredMedPayLoss'] == 0:
        return 'PIPIneligible'
    else:
        return None
test['Comment'] = test.apply(FP_com, axis=1)
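# The same labelling can be vectorized with np.select instead of a row-wise
# apply, which is much faster on large frames (sketch for the first two rules;
# the remaining rules follow the same pattern):
conditions = [
    test['HitANDRun'] == 1,
    (test['MedPaySubroElig'] == 0) & (test['PIPSubroElig'] == 0)
    & (test['InsuredBILoss'] == 0) & (test['InsuredVehicleLoss'] == 0)
    & (test['InsuredPIPLoss'] == 1) & (test['InsuredMedPayLoss'] == 1),
]
choices = ['HitANDRun', 'MPandPIPIneligible']
test['Comment'] = np.select(conditions, choices, default=None)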
########################################## Creating dummies the Python way (1/0) ##########
# Discretize counts: 1 if the value is positive, else 0
dis_list = ["Num_AdverseParties", "Num_ThirdParty_Vehicles", "GenNotesCount"]
for i in dis_list:
    i_cat = i + "_cat"
    test[i_cat] = test[i].apply(lambda x: 1 if x > 0 else 0)
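# Equivalent without apply (sketch): a vectorized comparison cast to int.
for i in dis_list:
    test[i + "_cat"] = (test[i] > 0).astype(int)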
######################### Columns that exist in train but not in test, and vice versa #######
# Collect dummy-column names present in one frame but missing from the other
main_lst_train = []
main_lst_test = []
for i in varlist:
    tr_uniq = []
    tst_uniq = []
    tr_arr = train[i].unique()
    tr_uniq.append(tr_arr.tolist())
    tst_arr = test[i].unique()
    tst_uniq.append(tst_arr.tolist())
    # Values in Train but not in Test
    try:
        for p in range(len(tr_uniq[0])):
            if tr_uniq[0][p] not in tst_uniq[0]:
                if tr_uniq[0][p] is np.nan:
                    pass
                else:
                    ap = tr_uniq[0][p]
                    ap = i + '_' + str(ap)
                    main_lst_train.append(ap)
    except:
        pass
    # Values in Test but not in Train
    try:
        for p in range(len(tst_uniq[0])):
            if tst_uniq[0][p] not in tr_uniq[0]:
                if tst_uniq[0][p] is np.nan:
                    pass
                else:
                    ap = tst_uniq[0][p]
                    ap = i + '_' + str(ap)
                    main_lst_test.append(ap)
    except:
        pass
# Add train-only dummy columns to X_test as zeros
if len(main_lst_train) > 0:
    for i in main_lst_train:
        try:
            X_test[i] = 0
        except:
            pass
# Drop test-only dummy columns from X_test
if len(main_lst_test) > 0:
    for i in main_lst_test:
        try:
            X_test.drop(i, axis=1, inplace=True)
        except:
            pass
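# The whole alignment above can usually be done in one line with reindex
# (a sketch; assumes X_train is the dummy-encoded training frame):
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)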
############ Creating a new column based on a condition ########
cp = []
for i in test.ClosedWithOutPay:
    if i == False:
        cp.append(0)
    if i == True:
        cp.append(1)
test['ClosedWithOutPay'] = cp
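# Equivalent one-liner (sketch): pandas casts booleans straight to 0/1.
test['ClosedWithOutPay'] = test['ClosedWithOutPay'].astype(int)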
############ Substituting and creating new columns
Data.loc[Data.RecoverySubrogationTotal > 0, 'PositiveOutcome'] = 1
Data.loc[Data.RecoverySubrogationTotal <= 0, 'PositiveOutcome'] = 0
######################### Cleaning text data #########
Data["lower"] = Data.LossDescription.str.lower().str.split()
Data["lower"] = Data["lower"].apply(lambda x: [re.sub(r'[0-9]+', r'', item) for item in x])
Data["lower"] = Data["lower"].apply(lambda x: [re.sub(r'[^a-zA-Z0-9 ]', r' ', item) for item in x])
########### Substituting values within a column
# Missing loss descriptions show up as floats (NaN); blank them out
for i in range(len(Data.LossDescription)):
    if type(Data.LossDescription[i]) == float:
        Data.loc[i, 'LossDescription'] = ''
# (equivalently: Data['LossDescription'] = Data['LossDescription'].fillna(''))
# Show which rows matched and which did not in a merge (adds a _merge column)
pd.merge(df1, df2, on='A', how='left', indicator=True)
# Creating a new variable: three methods
np.where(df['age'] <= 9, 'child', df['sex'])
df['sex'].where(df['age'] > 9, 'child')
df.apply(lambda x: 'child' if x['age'] <= 9 else x['sex'], axis=1)
# Select rows (matched on column x1) present in abc but not in other_df
# (note: "def" is a reserved word in Python, so it cannot name a DataFrame)
abc[~abc.x1.isin(other_df.x1)]
# Select rows (matched on column x1) present in both abc and other_df
abc[abc.x1.isin(other_df.x1)]
# filter rows
df.query('a > b')
# Conditions in pandas: if cond is true then A, else B
C = np.where(cond, A, B)
# If you have more than one condition, use np.select instead.
# For example, to make df['que'] equal df['two'] when df['one'] < df['two']:
conditions = [
    (df['one'] >= df['two']) & (df['one'] <= df['three']),
    df['one'] < df['two']]
choices = [df['one'], df['two']]
df['que'] = np.select(conditions, choices, default=np.nan)
# Creating a timestamp for today (assumes: import datetime)
timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
# Creating a timestamp for n days before today (put the number of days in n)
timestamp_n_days_ago = '{:%Y_%m_%d}'.format(datetime.datetime.now() - datetime.timedelta(days=n))
#selecting columns based on datatype
df.select_dtypes(include=['int'])
df.select_dtypes(include=[np.number])
df.select_dtypes(exclude=[object])
# If-else in a list comprehension, e.g. [x if x > 0 else 0 for x in values]
# List unique values in a DataFrame column
pd.unique(df.column_name.ravel())
# Convert Series datatype to numeric, coercing any non-numeric values to NaN
# (convert_objects was removed from pandas; use pd.to_numeric instead)
df['col'] = pd.to_numeric(df['col'], errors='coerce')
# Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Grab DataFrame rows where column doesn't have certain values
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
# Delete column from DataFrame
del df['column']
# Select from DataFrame using criteria from multiple columns
# (use `|` instead of `&` to do an OR)
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
# Rename several DataFrame columns
df = df.rename(columns={
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
})
# Lower-case all DataFrame column names
df.columns = df.columns.str.lower()
# Even more fancy DataFrame column re-naming
# lower-case all DataFrame column names (for example)
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Loop through rows in a DataFrame
# (if you must)
for index, row in df.iterrows():
    print(index, row['some column'])
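# itertuples() is usually much faster than iterrows() and gives attribute
# access (sketch; some_column is a placeholder and must be a valid identifier):
for row in df.itertuples():
    print(row.Index, row.some_column)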
# Next few examples show how to work with text data in Pandas.
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series)
df.column.str[0:2]
# Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()
# Get length of data in a DataFrame column
df.column_name.str.len()
# Sort dataframe by multiple columns (df.sort was removed; use sort_values)
df = df.sort_values(['col1', 'col2', 'col3'], ascending=[True, True, False])
# Get top n for each group of columns in a sorted dataframe
# (make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# Grab DataFrame rows where a specific column is null / not null
newdf = df[df['column'].isnull()]
newdf = df[df['column'].notnull()]
# Select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
# Change all NaNs to None (useful before
# loading to a db)
df = df.where((pd.notnull(df)), None)
# Get quick count of rows in a DataFrame
len(df.index)
# Pivot data (with flexibility about what
# becomes a column and what stays a row).
# Syntax works on Pandas >= .14
pd.pivot_table(
    df, values='cell_value',
    index=['col1', 'col2', 'col3'],  # these become the row index; will fail silently if any of these cols have null values
    columns=['col4'])  # data values in this column become their own column
# Change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric characters throughout a DataFrame
# (keeps digits, decimal point, and minus sign):
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9.-]+', '', regex=True)
# Set DataFrame column values based on other column values (h/t: @mlevkov)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
# Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example)
df['newcol'] = df['col1'].map(str) + df['col2'].map(str)
# Doing calculations with DataFrame columns that have missing values
# In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
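# Built-in alternative (sketch): str.split with expand=True returns both
# pieces as a two-column DataFrame in one call.
df[['new_col1', 'new_col2']] = df['original_col'].str.split(': ', n=1, expand=True)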
# Collapse hierarchical column indexes
df.columns = df.columns.get_level_values(0)
# Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
# Get a report of all duplicate records in a dataframe, based on specific columns
dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]
# Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
##################### Text Preprocessing #####################
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')  # needed for stopwords.words below
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'[a-z]+')
stop_words = set(stopwords.words('english'))
def preprocess(document):
    document = document.lower()  # Convert to lowercase
    words = tokenizer.tokenize(document)  # Tokenize
    words = [w for w in words if w not in stop_words]  # Remove stopwords
    # Lemmatize each word as noun, verb, adjective, and adverb in turn
    for pos in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]:
        words = [wordnet_lemmatizer.lemmatize(x, pos) for x in words]
    return " ".join(words)
############################# String Functions
### Extract only numbers (raw string avoids invalid-escape warnings;
### expand=False makes str.extract return a Series)
df['Average_Cost'] = df['Average_Cost'].astype(str).str.extract(r"([-+]?\d*\.\d+|[-+]?\d+)", expand=False).astype(float)
################# Convert string column to float by removing characters ##################################
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    if 'B' in x:
        return float(x.replace('B', '')) * 1000000000
    return 0.0
df['column_name'] = df['column_name'].apply(value_to_float)  # substitute your own column name
# OR, as a one-liner with regex replace + pd.eval:
df['Val'].replace({'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)