Skip to content

Commit

Permalink
Merge branch 'PreferredAI:master' into revamp-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
darrylong authored Oct 31, 2023
2 parents cc93c1c + edc83aa commit 83e4c20
Show file tree
Hide file tree
Showing 63 changed files with 1,586 additions and 1,356 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ __pycache__/

# C extensions
*.so
cornac/models/*/*.cpp
cornac/models/*/cython/*.cpp
cornac/utils/*.cpp

# Distribution / packaging
bin/
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=Tru

| | MAE | RMSE | AUC | MAP | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s) |
| ------------------------ | -----: | -----: | -----: | ------: | ------: | -----------: | --------: | ---------: | -------: |
| [MF](cornac/models/mf) | 0.7430 | 0.8998 | 0.7445 | 0.0407 | 0.0479 | 0.0437 | 0.0352 | 0.13 | 1.57 |
| [PMF](cornac/models/pmf) | 0.7534 | 0.9138 | 0.7744 | 0.0491 | 0.0617 | 0.0533 | 0.0479 | 2.18 | 1.64 |
| [BPR](cornac/models/bpr) | N/A | N/A | 0.8695 | 0.0753 | 0.0975 | 0.0727 | 0.0891 | 3.74 | 1.49 |
| [MF](cornac/models/mf) | 0.7430 | 0.8998 | 0.7445 | 0.0548 | 0.0761 | 0.0675 | 0.0463 | 0.13 | 1.57 |
| [PMF](cornac/models/pmf) | 0.7534 | 0.9138 | 0.7744 | 0.0671 | 0.0969 | 0.0813 | 0.0639 | 2.18 | 1.64 |
| [BPR](cornac/models/bpr) | N/A | N/A | 0.8695 | 0.1042 | 0.1500 | 0.1110 | 0.1195 | 3.74 | 1.49 |


For more details, please take a look at our [examples](examples) as well as [tutorials](tutorials). For learning purposes, this list of [tutorials on recommender systems](https://github.com/PreferredAI/tutorials/tree/master/recommender-systems) is more organized and comprehensive.
Expand Down
89 changes: 24 additions & 65 deletions cornac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,13 @@ class Dataset(object):
global_mean: float
Average value over the rating observations.
uir_tuple: tuple
Tuple of three numpy arrays (user_indices, item_indices, rating_values).
timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `UIRT` format.
"""

def __init__(
Expand Down Expand Up @@ -99,12 +98,8 @@ def __init__(
self.min_rating = np.min(r_values)
self.global_mean = np.mean(r_values)

self.__total_users = None
self.__total_items = None
self.__user_ids = None
self.__item_ids = None
self.__user_indices = None
self.__item_indices = None

self.__user_data = None
self.__item_data = None
Expand All @@ -114,47 +109,19 @@ def __init__(
self.__csc_matrix = None
self.__dok_matrix = None

@property
def total_users(self):
"""Total number of users including test and validation users if exists"""
return self.__total_users if self.__total_users is not None else self.num_users

@total_users.setter
def total_users(self, input_value):
"""Set total number of users for the dataset"""
assert input_value >= self.num_users
self.__total_users = input_value

@property
def total_items(self):
"""Total number of items including test and validation items if exists"""
return self.__total_items if self.__total_items is not None else self.num_items

@total_items.setter
def total_items(self, input_value):
"""Set total number of items for the dataset"""
assert input_value >= self.num_items
self.__total_items = input_value

@property
def user_ids(self):
"""An iterator over the raw user ids"""
return self.uid_map.keys()
"""Return the list of raw user ids"""
if self.__user_ids is None:
self.__user_ids = list(self.uid_map.keys())
return self.__user_ids

@property
def item_ids(self):
"""An iterator over the raw item ids"""
return self.iid_map.keys()

@property
def user_indices(self):
"""An iterator over the user indices"""
return self.uid_map.values()

@property
def item_indices(self):
"""An iterator over the item indices"""
return self.iid_map.values()
"""Return the list of raw item ids"""
if self.__item_ids is None:
self.__item_ids = list(self.iid_map.keys())
return self.__item_ids

@property
def user_data(self):
Expand Down Expand Up @@ -185,7 +152,7 @@ def item_data(self):
@property
def chrono_user_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (items, ratings, timestamps) interacted by the corresponding users.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -214,7 +181,7 @@ def chrono_user_data(self):
@property
def chrono_item_data(self):
"""Data organized by item sorted chronologically (timestamps required).
A dictionary where keys are items, values are tuples of three chronologically
A dictionary where keys are items, values are tuples of three chronologically
sorted lists (users, ratings, timestamps) interacted with the corresponding items.
"""
if self.timestamps is None:
Expand Down Expand Up @@ -272,7 +239,7 @@ def dok_matrix(self):
"""The user-item interaction matrix in DOK sparse format"""
if self.__dok_matrix is None:
self.__dok_matrix = dok_matrix(
(self.num_users, self.num_items), dtype='float'
(self.num_users, self.num_items), dtype="float"
)
for u, i, r in zip(*self.uir_tuple):
self.__dok_matrix[u, i] = r
Expand Down Expand Up @@ -364,27 +331,29 @@ def build(
raise ValueError("data is empty after being filtered!")

uir_tuple = (
np.asarray(u_indices, dtype='int'),
np.asarray(i_indices, dtype='int'),
np.asarray(r_values, dtype='float'),
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.asarray(r_values, dtype="float"),
)

timestamps = (
np.fromiter((int(data[i][3]) for i in valid_idx), dtype='int')
np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
if fmt == "UIRT"
else None
)

return cls(
dataset = cls(
num_users=len(global_uid_map),
num_items=len(global_iid_map),
uid_map=uid_map,
iid_map=iid_map,
uid_map=global_uid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)

return dataset

@classmethod
def from_uir(cls, data, seed=None):
"""Constructing Dataset from UIR (User, Item, Rating) triplet data.
Expand All @@ -407,7 +376,7 @@ def from_uir(cls, data, seed=None):

@classmethod
def from_uirt(cls, data, seed=None):
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
quadruplet data.
Parameters
Expand Down Expand Up @@ -528,7 +497,6 @@ def uij_iter(self, batch_size=1, shuffle=False, neg_sampling="uniform"):
batch of negative items (array of 'int')
"""

if neg_sampling.lower() == "uniform":
neg_population = np.arange(self.num_items)
elif neg_sampling.lower() == "popularity":
Expand Down Expand Up @@ -564,7 +532,7 @@ def user_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of user indices (array of 'int')
"""
user_indices = np.fromiter(self.user_indices, dtype='int')
user_indices = np.fromiter(set(self.uir_tuple[0]), dtype="int")
for batch_ids in self.idx_iter(len(user_indices), batch_size, shuffle):
yield user_indices[batch_ids]

Expand All @@ -582,18 +550,10 @@ def item_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of item indices (array of 'int')
"""
item_indices = np.fromiter(self.item_indices, 'int')
item_indices = np.fromiter(set(self.uir_tuple[1]), "int")
for batch_ids in self.idx_iter(len(item_indices), batch_size, shuffle):
yield item_indices[batch_ids]

def is_unk_user(self, user_idx):
"""Return whether or not a user is unknown given the user index"""
return user_idx >= self.num_users

def is_unk_item(self, item_idx):
"""Return whether or not an item is unknown given the item index"""
return item_idx >= self.num_items

def add_modalities(self, **kwargs):
self.user_feature = kwargs.get("user_feature", None)
self.item_feature = kwargs.get("item_feature", None)
Expand All @@ -605,4 +565,3 @@ def add_modalities(self, **kwargs):
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)

21 changes: 10 additions & 11 deletions cornac/eval_methods/base_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_mat = test_set.csr_matrix
pd_mat = csr_matrix((r_preds, (u_indices, i_indices)), shape=gt_mat.shape)

test_user_indices = set(u_indices)
for mt in metrics:
if user_based: # averaging over users
user_results.append(
Expand All @@ -93,7 +94,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_ratings=gt_mat.getrow(user_idx).data,
pd_ratings=pd_mat.getrow(user_idx).data,
).item()
for user_idx in test_set.user_indices
for user_idx in test_user_indices
}
)
avg_results.append(sum(user_results[-1].values()) / len(user_results[-1]))
Expand Down Expand Up @@ -159,7 +160,7 @@ def ranking_eval(
avg_results = []
user_results = [{} for _ in enumerate(metrics)]

gt_mat = test_set.csr_matrix
test_mat = test_set.csr_matrix
train_mat = train_set.csr_matrix
val_mat = None if val_set is None else val_set.csr_matrix

Expand All @@ -170,10 +171,11 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm(
test_set.user_indices, desc="Ranking", disable=not verbose, miniters=100
test_user_indices, desc="Ranking", disable=not verbose, miniters=100
):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
test_pos_items = pos_items(test_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

Expand All @@ -183,9 +185,9 @@ def pos_items(csr_row):

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
train_pos_items = (
[]
if train_set.is_unk_user(user_idx)
else pos_items(train_mat.getrow(user_idx))
pos_items(train_mat.getrow(user_idx))
if user_idx < train_mat.shape[0]
else []
)

# binary mask for ground-truth negative items, removing all positive items
Expand All @@ -196,7 +198,7 @@ def pos_items(csr_row):
if exclude_unknowns:
u_gt_pos_mask = u_gt_pos_mask[: train_set.num_items]
u_gt_neg_mask = u_gt_neg_mask[: train_set.num_items]

item_indices = np.nonzero(u_gt_pos_mask + u_gt_neg_mask)[0]
u_gt_pos_items = np.nonzero(u_gt_pos_mask)[0]
u_gt_neg_items = np.nonzero(u_gt_neg_mask)[0]
Expand Down Expand Up @@ -538,9 +540,6 @@ def _build_datasets(self, train_data, test_data, val_data=None):
print("Total users = {}".format(self.total_users))
print("Total items = {}".format(self.total_items))

self.train_set.total_users = self.total_users
self.train_set.total_items = self.total_items

def _build_modalities(self):
for user_modality in [
self.user_feature,
Expand Down
Loading

0 comments on commit 83e4c20

Please sign in to comment.