-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocessing.py
127 lines (88 loc) · 3.96 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 27 07:39:46 2021
@author: lpott
"""
import numpy as np
import pandas as pd
import os
from time import time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
def create_df(filename=None):
    """Load a MovieLens-style '::'-delimited ratings file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to a file whose rows are ``user_id::item_id::rating::timestamp``.

    Returns
    -------
    pd.DataFrame
        Columns ['user_id','item_id','rating','timestamp'], sorted by
        timestamp ascending, with a fresh 0..n-1 index.
    """
    print("="*10,"Creating DataFrame","="*10)
    # Multi-character separators require the python engine; passing it
    # explicitly avoids the ParserWarning pandas emits on silent fallback.
    df = pd.read_csv(filename, sep='::', header=None, engine='python')
    df.columns = ['user_id','item_id','rating','timestamp']
    # Chronological order so later per-user histories are time-ordered.
    df.sort_values('timestamp',inplace=True)
    user_group = df.groupby('user_id').size()
    print(df.nunique())
    print(df.shape)
    print("Minimum Session Length: {:d}".format(user_group.min()))
    print("Maximum Session Length: {:d}".format(user_group.max()))
    print("Average Session Length: {:.2f}".format(user_group.mean()))
    return df.reset_index(drop=True)
def filter_df(df=None, item_min=10):
    """Drop all users having fewer than ``item_min`` interactions.

    Parameters
    ----------
    df : pd.DataFrame or None
        Interaction frame with a 'user_id' column; None is passed through.
    item_min : int
        Minimum number of rows a user must have to be kept.

    Returns
    -------
    pd.DataFrame or None
        Filtered copy with a fresh 0..n-1 index, or None when df is None.
    """
    print("="*10,"Filtering Sessions <= {:d} DataFrame".format(item_min),"="*10)
    if df is None:
        return None
    user_counts = df.groupby('user_id').size()
    # np.isin supersedes the deprecated np.in1d; also avoid shadowing the
    # function's own name with the result variable.
    keep_mask = np.isin(df.user_id, user_counts[user_counts >= item_min].index)
    filtered = df[keep_mask].reset_index(drop=True)
    # Sanity check: no surviving user falls below the threshold.
    assert (filtered.groupby('user_id').size() < item_min).sum() == 0
    user_group = filtered.groupby('user_id').size()
    print(filtered.nunique())
    print(filtered.shape)
    print("Minimum Session Length: {:d}".format(user_group.min()))
    print("Maximum Session Length: {:d}".format(user_group.max()))
    print("Average Session Length: {:.2f}".format(user_group.mean()))
    return filtered
class reset_df(object):
    """Re-maps raw user/item ids to contiguous 0-based integer ids.

    Keeps one LabelEncoder per id space so either mapping can be
    inverted independently after fitting.
    """

    def __init__(self):
        print("="*10,"Initialize Reset DataFrame Object","="*10)
        self.item_enc = LabelEncoder()
        self.user_enc = LabelEncoder()

    def fit_transform(self, df):
        """Replace user_id/item_id columns in-place with encoded ids; returns df."""
        print("="*10,"Resetting user ids and item ids in DataFrame","="*10)
        for column, encoder in (('item_id', self.item_enc),
                                ('user_id', self.user_enc)):
            df[column] = encoder.fit_transform(df[column])
        # Encoded ids are contiguous integers starting at zero.
        assert df.user_id.min() == 0
        assert df.item_id.min() == 0
        return df

    def inverse_transform(self, df):
        """Restore the original raw ids; mutates and returns df."""
        for column, encoder in (('item_id', self.item_enc),
                                ('user_id', self.user_enc)):
            df[column] = encoder.inverse_transform(df[column])
        return df
def create_user_history(df=None):
    """Collect each user's time-ordered list of item ids.

    Assumes ``df`` is already sorted chronologically (as produced by
    ``create_df``), so each user's list is in interaction order.

    Parameters
    ----------
    df : pd.DataFrame or None
        Frame with 'user_id' and 'item_id' columns; None is passed through.

    Returns
    -------
    dict or None
        Maps user_id -> list of item_ids, or None when df is None.
    """
    if df is None:
        return None
    print("="*10,"Creating User Histories","="*10)
    # A single groupby pass is O(rows); the previous per-user boolean-mask
    # scan was O(users * rows). sort=False keeps first-appearance key order
    # and within-group row order is preserved, so the histories are
    # identical. The progress bar is dropped: there is no per-user loop left.
    return {
        uid: items.tolist()
        for uid, items in df.groupby('user_id', sort=False)['item_id']
    }
def train_val_test_split(user_history=None, max_length=200):
    """Split each user's history into train/val/test trailing windows.

    Each split keeps at most ``max_length + 1`` trailing items; validation
    and test are each shifted one step later than the previous split
    (leave-one-out style).

    Parameters
    ----------
    user_history : dict or None
        Maps user_id -> chronological list of item ids; None passes through.
    max_length : int
        Nominal window length (one extra item is kept internally).

    Returns
    -------
    tuple of three dicts (train, val, test) keyed by user id, or None.
    """
    if user_history is None:
        return None
    # Window is one longer than requested (input sequence + target item).
    window = max_length + 1
    print("="*10,"Splitting User Histories into Train, Validation, and Test Splits","="*10)
    train_split, val_split, test_split = {}, {}, {}
    for uid, seq in tqdm(user_history.items(), position=0, leave=True):
        train_split[uid] = seq[-(window + 2):-2]
        val_split[uid] = seq[-(window + 1):-1]
        test_split[uid] = seq[-window:]
    return train_split, val_split, test_split
def create_user_noclick(user_history, df, n_items):
    """Build, per user, the set of never-clicked items and a popularity
    distribution over them (for negative sampling).

    Parameters
    ----------
    user_history : dict
        Maps user_id -> list of clicked item ids.
    df : pd.DataFrame
        Interaction frame with an 'item_id' column (ids assumed encoded
        into 0..n_items-1 — see reset_df).
    n_items : int
        Total number of distinct items.

    Returns
    -------
    dict
        Maps user_id -> (no_click_items, probabilities) where probabilities
        are popularity weights aligned with no_click_items.
    """
    print("="*10,"Creating User 'no-click' history","="*10)
    user_noclick = {}
    all_items = set(range(n_items))
    # groupby's `sort` parameter is a boolean; the original passed the
    # string 'item_id', which only worked because non-empty strings are
    # truthy. Sorting by key is the intended (and default) behavior.
    item_counts = df.groupby('item_id', sort=True).size()
    for uid, history in tqdm(user_history.items()):
        no_clicks = list(all_items.difference(history))
        # reindex tolerates item ids that never occur in df (counted as 0)
        # instead of raising KeyError like direct fancy indexing would.
        subset = item_counts.reindex(no_clicks, fill_value=0)
        probabilities = (subset / subset.sum()).values
        user_noclick[uid] = (no_clicks, probabilities)
    return user_noclick