-
Notifications
You must be signed in to change notification settings - Fork 0
/
movie_collaborative.py
67 lines (40 loc) · 2.1 KB
/
movie_collaborative.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 12 19:44:40 2019
@author: AD
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# --- Load the raw tables -----------------------------------------------------
# The file names suggest the MovieLens 100K layout -- TODO confirm.
# Every file is read without a header row, so the lists below supply the
# column names exactly as written (including any padding inside the strings).
# NOTE(review): the padded / truncated labels (e.g. "user i") look
# unintentional; nothing in this script reads these columns by name.

# User table: pipe-separated.
u_cols = [
    "user id ",
    " age ",
    " gender ",
    " occupation ",
    "zip code",
]
users = pd.read_csv("u.user", names=u_cols, sep="|", encoding="latin-1")

# Full ratings table: tab-separated. The matrix-building code further down
# uses the first three columns of this frame.
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings = pd.read_csv("u.data", names=r_cols, sep="\t", encoding="latin-1")

# Item table: pipe-separated; five descriptive columns followed by one
# column per genre label.
i_cols = [
    "movie id",
    "movie title",
    "release date",
    "video release date",
    "IMDb URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
items = pd.read_csv("u.item", names=i_cols, sep="|", encoding="latin-1")

# Bare expressions: they only display something in an interactive session
# and have no effect when the file is run as a script.
users.shape
ratings.shape
items.shape

# Train / test split files: tab-separated, four columns each.
# r_cols is rebound here; `ratings` above keeps the names it was read with.
r_cols = ["user i", "item id", " rating ", " timestamp"]
ratings_train = pd.read_csv("ua.base", sep="\t", names=r_cols, encoding="latin-1")
ratings_test = pd.read_csv("ua.test", sep="\t", names=r_cols, encoding="latin-1")
ratings_train.shape
ratings_test.shape
# Matrix dimensions: one row per distinct user id, one column per distinct
# movie id found in the ratings table.
n_users = len(ratings.user_id.unique())
n_items = len(ratings.movie_id.unique())

# Dense user x item matrix; cells without a rating stay at 0.
# Writing at [id - 1] into a matrix sized by the distinct-id count only
# works when the ids run from 1 to n without gaps.
data_matrix = np.zeros((n_users, n_items))
for user_id, movie_id, rating in zip(
    ratings.user_id, ratings.movie_id, ratings.rating
):
    data_matrix[user_id - 1, movie_id - 1] = rating
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The results are used below as similarity weights,
# so convert them: otherwise identical rows (distance 0) would get zero
# weight and the least similar rows the largest weight.
# Rows of data_matrix are users; rows of its transpose are items.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')
def predict(ratings, similarity, type='user'):
    """Return similarity-weighted rating predictions.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix. In this file it is called with users as rows and
        items as columns.
    similarity : numpy.ndarray
        Square weight matrix: (rows x rows) of ``ratings`` for
        ``type='user'``, (columns x columns) for ``type='item'``.
    type : {'user', 'item'}
        ``'user'``: each row's mean rating plus the weighted average of the
        other rows' deviations from their own means.
        ``'item'``: weighted average of each row's ratings across columns.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        # Keep the per-row mean as a column vector so it broadcasts over items.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        rating_diff = ratings - mean_user_rating
        # Normalise by the sum of absolute weights (abs first, then sum) so
        # weights of mixed sign cannot cancel out and shrink the denominator
        # towards zero; this matches the item branch.
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return mean_user_rating + similarity.dot(rating_diff) / weight_totals
    if type == 'item':
        # Row vector: one normaliser per item column.
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    raise ValueError(
        "type must be 'user' or 'item', got {!r}".format(type)
    )
# Predicted rating matrices for the full data set, one per weighting scheme:
# user-user weights first, then item-item weights.
user_prediction = predict(
    ratings=data_matrix,
    similarity=user_similarity,
    type='user',
)
item_prediction = predict(
    ratings=data_matrix,
    similarity=item_similarity,
    type='item',
)