-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocessing.py
396 lines (300 loc) · 15 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 27 07:39:46 2021
@author: lpott, hamlinliu17
"""
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# TODO: check if you can increase the efficiency
def create_df(filename=None,size="1m"):
    """
    Load a MovieLens ratings file into a timestamp-sorted dataframe.

    Parameters
    ----------
    filename : string, optional
        Path to the ratings file: a '.dat' file with '::' separators for
        the 1m dataset, or a headered '.csv' for the 20m dataset.
        The default is None.
    size : string, optional
        Which MovieLens layout to parse, "1m" or "20m". The default is "1m".

    Returns
    -------
    pandas dataframe or None
        Dataframe sorted by timestamp with columns user_id, item_id,
        rating, timestamp; None when `size` is not recognized.
    """
    print("="*10,"Creating DataFrame","="*10)
    column_names = ['user_id','item_id','rating','timestamp']
    if size == "1m":
        # 1m ratings are stored as 'user::item::rating::timestamp' lines
        frame = pd.read_csv(filename, sep='::', header=None, engine='python')
        frame.columns = column_names
    elif size == "20m":
        # 20m ratings ship as a regular CSV with a header row
        frame = pd.read_csv(filename, header=0, names=column_names)
    else:
        print("Not a proper size, or file not found")
        return
    # order interactions chronologically so each session reads oldest -> newest
    frame.sort_values('timestamp', inplace=True)
    # session length per user = number of rating rows for that user
    session_lengths = frame.groupby('user_id').size()
    # unique value counts per column (users, items, ratings, timestamps)
    print(frame.nunique())
    # overall table dimensions (rows x columns)
    print(frame.shape)
    # summary statistics of the per-user session lengths
    print("Minimum Session Length: {:d}".format(session_lengths.min()))
    print("Maximum Session Length: {:d}".format(session_lengths.max()))
    print("Average Session Length: {:.2f}".format(session_lengths.mean()))
    return frame.reset_index(drop=True)
def create_movie_df(filename=None,size="1m"):
    """
    Load a movie-metadata file into a dataframe and fill in missing plots.

    Parameters
    ----------
    filename : string, optional
        Path to the movie metadata file. The "1m" layout has columns
        item_id, title, genre, mplot; the "20m" layout additionally has
        imdb_id and tmbd_id columns. The default is None.
    size : string, optional
        Which layout to parse, "1m" or "20m". The default is "1m".

    Returns
    -------
    pandas dataframe or None
        Movie dataframe whose `mplot` column is guaranteed non-null and
        string-typed (missing plots are replaced by the movie title);
        None when `size` is not recognized.
    """
    print("="*10,"Creating Movie Info DataFrame","="*10)
    if size == "1m":
        df = pd.read_csv(filename,header=0,names=["item_id","title","genre","mplot"])
    elif size == "20m":
        df = pd.read_csv(filename,header=0,names=["item_id", "title","genre","imdb_id","tmbd_id","mplot"])
    else:
        # BUG FIX: previously fell through with `df` undefined and raised a
        # NameError; now mirrors create_df and returns None.
        print("Not a proper size, or file not found")
        return
    # get the shape of the dataframe (rows x columns)
    print(df.shape)
    # word counts of the plots that are present (used for the stats below);
    # `~` replaces the unary `-` on a boolean Series, which modern pandas rejects
    plot_sizes = df[~df.mplot.isna()].mplot.apply(lambda x: len(str(x).split()))
    number_missing = df.mplot.isna().sum()
    # if there is no movie plot for a movie, use the title as the plot;
    # .loc avoids pandas' chained-assignment pitfall (SettingWithCopyWarning)
    df.loc[df.mplot.isna(), 'mplot'] = df.title[df.mplot.isna()]
    # convert all the movie plots to strings (in case some are just numbers)
    df.mplot = df.mplot.apply(str)
    # print statistics about the plot lengths such as max, min, and average
    print("Minimum Plot Length: {:d}".format(plot_sizes.min()))
    print("Maximum Plot Length: {:d}".format(plot_sizes.max()))
    print("Average Plot Length: {:.2f}".format(plot_sizes.mean()))
    print("Number of missing plots: {:d}".format(number_missing))
    # number of unique values in each column
    print(df.nunique())
    return df.reset_index(drop=True)
def filter_df(df=None,item_min=10):
    """
    Remove users whose session length is below a minimum.

    Parameters
    ----------
    df : pandas dataframe, optional
        The pandas dataframe where each row is a user id, item id, rating,
        and timestamp. The default is None.
    item_min : integer, optional
        The minimum number of items required in a user session (users with
        shorter sessions are dropped entirely). The default is 10.

    Returns
    -------
    pandas dataframe or None
        The input dataframe restricted to users with at least `item_min`
        interactions, re-indexed from 0; None when `df` is None.
    """
    print("="*10,"Filtering Sessions <= {:d} DataFrame".format(item_min),"="*10)
    if df is None:
        return
    # session length per user
    user_counts = df.groupby('user_id').size()
    # mask of rows belonging to users with at least item_min interactions
    # (np.isin supersedes the deprecated np.in1d)
    user_subset = np.isin(df.user_id, user_counts[user_counts >= item_min].index)
    # keep only the qualifying users and re-index from 0
    # (renamed from `filter_df`, which shadowed this function's name)
    filtered = df[user_subset].reset_index(drop=True)
    # sanity check: no remaining session may be shorter than item_min
    assert (filtered.groupby('user_id').size() < item_min).sum() == 0
    # session length per surviving user
    user_group = filtered.groupby('user_id').size()
    # unique value counts per column
    print(filtered.nunique())
    # table dimensions (rows x columns)
    print(filtered.shape)
    # summary statistics of the surviving session lengths
    print("Minimum Session Length: {:d}".format(user_group.min()))
    print("Maximum Session Length: {:d}".format(user_group.max()))
    print("Average Session Length: {:.2f}".format(user_group.mean()))
    return filtered
def create_user_history(df=None):
    """
    Build each user's chronological item-interaction history.

    Parameters
    ----------
    df : pandas dataframe, optional
        Dataframe with user_id and item_id columns, already sorted by
        timestamp (as produced by create_df). The default is None.

    Returns
    -------
    user_history : dictionary or None
        A dictionary where each key is the user id and each value is the
        list of item ids that user interacted with, in dataframe (i.e.
        chronological) order. I.e. user_id = 5, session = [1,4,7,2] ...
        None when `df` is None.
    """
    if df is None:
        return None
    print("="*10,"Creating User Histories","="*10)
    # PERF FIX: a single grouped pass over the dataframe instead of
    # re-filtering the full dataframe once per user (O(users * rows)).
    # sort=False keeps groups in first-appearance order, matching the old
    # df.user_id.unique() iteration order; within-group row order is the
    # dataframe's (chronological) order, as before.
    user_history = df.groupby('user_id', sort=False)['item_id'].apply(list).to_dict()
    return user_history
def convert_genres(df, null_genre="NULL"):
    """
    Split pipe-delimited genre strings into equal-length genre lists.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe produced by create_movie_df; must contain the columns
        item_id and genre, where genre is a '|'-delimited string.
    null_genre : string, optional
        Placeholder appended so every genre list has the same length.
        The default is "NULL".

    Returns
    -------
    pandas dataframe
        A copy holding only item_id and genre, where genre is now a list
        right-padded with `null_genre` up to the longest genre list seen.
    """
    out = df[['item_id', 'genre']].copy()
    # '|'-separated string -> list of genre tokens
    out['genre'] = out.genre.apply(lambda g: g.split('|'))
    # the longest genre list determines the common padded width
    widest = out['genre'].apply(len).max()
    # right-pad every list with the placeholder up to the common width
    out['genre'] = out.genre.apply(lambda g: g + [null_genre] * (widest - len(g)))
    return out
class reset_df(object):
    """Re-map user, item, and genre ids to contiguous 0-based encodings.

    Wraps three sklearn LabelEncoders so that dataframe ids can be densely
    re-indexed (e.g. for embedding lookups) and later mapped back with
    inverse_transform.
    """
    def __init__(self):
        print("="*10,"Initialize Reset DataFrame Object","="*10)
        # initialize label encoders (each encodes labels as values between 0 and n_classes-1)
        self.item_enc = LabelEncoder()   # encoder for item ids
        self.user_enc = LabelEncoder()   # encoder for user ids
        self.genre_enc = LabelEncoder()  # encoder for genre tokens
    def fit_transform(self, df, movie_df=None):
        """
        Parameters
        ----------
        df : pandas dataframe
            The pandas dataframe where each row is a user id, item id, rating, and timestamp.
        movie_df : pandas dataframe, optional
            Dataframe with columns item_id and genre (genre being a list of
            genre tokens), as produced by convert_genres. The default is None.
        Returns
        -------
        df : pandas dataframe
            The dataframe with item ids and user ids re-encoded to
            0..n_unique-1. NOTE: mutated in place.
        movie_df : pandas dataframe
            Only returned when movie_df was given: item ids re-encoded (rows
            for items absent from df are dropped), genre lists label-encoded,
            and one extra all-"NULL" padding row appended at the end.
        """
        print("="*10,"Resetting user ids and item ids in DataFrame","="*10)
        # encode item ids with values between 0 and n_classes-1.
        df['item_id'] = self.item_enc.fit_transform(df['item_id'])
        # encode user ids with values between 0 and n_classes-1.
        df['user_id'] = self.user_enc.fit_transform(df['user_id'])
        if movie_df is not None:
            print("="*10,"Resetting movie item ids in movie DataFrame","="*10)
            # map each movie's item id through the just-fitted item encoder;
            # items that never appear in the ratings dataframe get the
            # sentinel -1 and are dropped below
            movie_df['item_id'] = [self.item_enc.transform([itemid]).item() if (itemid in self.item_enc.classes_) else -1 for itemid in movie_df.item_id]
            # fit the genre encoder on every genre token observed across all movies
            encodings = np.unique(np.concatenate(movie_df['genre'].tolist()))
            self.genre_enc.fit(encodings)
            # replace each genre-token list with its encoded integer array
            movie_df['genre'] = movie_df.genre.apply(self.genre_enc.transform)
            # drop movies with no interactions (sentinel -1) and re-index from 0
            movie_df = movie_df[movie_df.item_id != -1].reset_index(drop=True)
            # append one padding row whose genres are all the encoded "NULL" token;
            # NOTE(review): assumes movie_df has exactly the two columns
            # [item_id, genre] as produced by convert_genres, and that the
            # padding item id (== current row count) is one past the largest
            # encoded item id — confirm against callers
            len_genres = len(movie_df['genre'][0])
            movie_df.loc[len(movie_df.index)] = [len(movie_df.index), np.array([self.genre_enc.transform(["NULL"]).item()] * len_genres)]
        # make sure that the item id and the user id both start at 0
        assert df.user_id.min() == 0
        assert df.item_id.min() == 0
        if movie_df is not None:
            return df, movie_df
        else:
            return df
    def inverse_transform(self,df):
        """
        Parameters
        ----------
        df : pandas dataframe
            The pandas dataframe where each row is a user id, item id, rating,
            and timestamp, previously encoded by fit_transform.
        Returns
        -------
        df : pandas dataframe
            The dataframe with item ids and user ids mapped back to their
            original values. NOTE: mutated in place.
        """
        # Transform item ids back to original encoding.
        df['item_id'] = self.item_enc.inverse_transform(df['item_id'])
        # Transform user ids back to original encoding.
        df['user_id'] = self.user_enc.inverse_transform(df['user_id'])
        return df
def train_val_test_split(user_history=None,max_length=200):
    """
    Split user histories into train/validation/test with leave-one-out.

    Parameters
    ----------
    user_history : dictionary
        A dictionary where each key is the user id and each value is that
        user's chronological list of item ids.
        I.e. user_id = 5 , session = [1,4,7,2] ...
    max_length : integer, optional
        Maximum allowed session length; longer sessions keep only their
        most recent items. The default is 200.

    Returns
    -------
    train_history : dictionary
        Per user: the capped history excluding the final two items.
    val_history : dictionary
        Per user: the capped history excluding the final item.
    test_history : dictionary
        Per user: the capped history including all items.
    """
    if user_history is None:
        return None
    # one extra slot so a window of max_length inputs still has a label
    # (if max length is 40 we keep 41 items: 0:40 is input, 1:41 is label)
    cap = max_length + 1
    print("="*10,"Splitting User Histories into Train, Validation, and Test Splits","="*10)
    # one dictionary per split, filled in a single pass over the users
    train_history, val_history, test_history = {}, {}, {}
    for uid, items in tqdm(user_history.items(), position=0, leave=True):
        # train: everything but the last two items, capped to `cap` entries
        train_history[uid] = items[-(cap + 2):-2]
        # validation: everything but the last item, capped to `cap` entries
        val_history[uid] = items[-(cap + 1):-1]
        # test: the most recent `cap` items
        test_history[uid] = items[-cap:]
    return train_history, val_history, test_history
# TODO: make this better
def create_user_noclick(user_history,df,n_items):
    """
    Build, per user, the items never clicked plus a popularity-based
    sampling distribution over them (for negative sampling).

    Parameters
    ----------
    user_history : dictionary
        A dictionary where each key is the user id and each value is a list
        of the user session history. I.e. user_id = 5 , session = [1,4,7,2] ...
    df : pandas dataframe
        The pandas dataframe where each row is a user id, item id, rating,
        and timestamp.
    n_items : integer
        The number of unique items in the dataset (item ids assumed to be
        0..n_items-1).

    Returns
    -------
    user_noclick : dictionary
        A dictionary where the key is the user id and the value is a tuple:
        list 1 is the items the user has never clicked on; array 2 holds
        the corresponding probabilities, proportional to each item's global
        click count and normalized to sum to 1.
    """
    print("="*10,"Creating User 'no-click' history","="*10)
    user_noclick = {}
    all_items = np.arange(n_items)
    # number of clicks per item across all users
    item_counts = df.groupby('item_id',sort='item_id').size()
    # iterate through each user and corresponding user session history
    for uid,history in tqdm(user_history.items()):
        # items this user has never clicked on historically
        no_clicks = list(set.difference(set(all_items.tolist()),set(history)))
        # global click counts restricted to those items; BUG FIX: reindex
        # (rather than label-indexing) so item ids that nobody ever clicked
        # yield a count of 0 instead of raising a KeyError
        item_counts_subset = item_counts.reindex(no_clicks, fill_value=0)
        # BUG FIX: actually normalize so the values form a valid probability
        # distribution (the old code returned raw counts even though both
        # the comment and the docstring promised probabilities)
        probabilities = (item_counts_subset / item_counts_subset.sum()).values
        # tuple of the no-click list and its sampling probabilities
        user_noclick[uid] = (no_clicks,probabilities)
    return user_noclick