Dataset uid_map and iid_map are the global ones
tqtg committed Oct 25, 2023
1 parent 9b06a51 commit 139f0eb
Showing 11 changed files with 8,948 additions and 7,330 deletions.
59 changes: 26 additions & 33 deletions cornac/data/dataset.py
@@ -64,7 +64,7 @@ class Dataset(object):
global_mean: float
Average value over the rating observations.
uir_tuple: tuple
Tuple of three numpy arrays (user_indices, item_indices, rating_values).
@@ -103,8 +103,6 @@ def __init__(
self.__total_items = None
self.__user_ids = None
self.__item_ids = None
self.__user_indices = None
self.__item_indices = None

self.__user_data = None
self.__item_data = None
@@ -138,23 +136,17 @@ def total_items(self, input_value):

@property
def user_ids(self):
"""An iterator over the raw user ids"""
return self.uid_map.keys()
"""Return the list of raw user ids"""
if self.__user_ids is None:
self.__user_ids = list(self.uid_map.keys())
return self.__user_ids

@property
def item_ids(self):
"""An iterator over the raw item ids"""
return self.iid_map.keys()

@property
def user_indices(self):
"""An iterator over the user indices"""
return self.uid_map.values()

@property
def item_indices(self):
"""An iterator over the item indices"""
return self.iid_map.values()
"""Return the list of raw item ids"""
if self.__item_ids is None:
self.__item_ids = list(self.iid_map.keys())
return self.__item_ids

@property
def user_data(self):
@@ -185,7 +177,7 @@ def item_data(self):
@property
def chrono_user_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (items, ratings, timestamps) interacted by the corresponding users.
"""
if self.timestamps is None:
@@ -214,7 +206,7 @@ def chrono_user_data(self):
@property
def chrono_item_data(self):
"""Data organized by item sorted chronologically (timestamps required).
A dictionary where keys are items, values are tuples of three chronologically
sorted lists (users, ratings, timestamps) interacted with the corresponding items.
"""
if self.timestamps is None:
@@ -272,7 +264,7 @@ def dok_matrix(self):
"""The user-item interaction matrix in DOK sparse format"""
if self.__dok_matrix is None:
self.__dok_matrix = dok_matrix(
(self.num_users, self.num_items), dtype='float'
(self.num_users, self.num_items), dtype="float"
)
for u, i, r in zip(*self.uir_tuple):
self.__dok_matrix[u, i] = r
@@ -364,27 +356,29 @@ def build(
raise ValueError("data is empty after being filtered!")

uir_tuple = (
np.asarray(u_indices, dtype='int'),
np.asarray(i_indices, dtype='int'),
np.asarray(r_values, dtype='float'),
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.asarray(r_values, dtype="float"),
)

timestamps = (
np.fromiter((int(data[i][3]) for i in valid_idx), dtype='int')
np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
if fmt == "UIRT"
else None
)

return cls(
dataset = cls(
num_users=len(global_uid_map),
num_items=len(global_iid_map),
uid_map=uid_map,
iid_map=iid_map,
uid_map=global_uid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)

return dataset

@classmethod
def from_uir(cls, data, seed=None):
"""Constructing Dataset from UIR (User, Item, Rating) triplet data.
@@ -407,7 +401,7 @@ def from_uir(cls, data, seed=None):

@classmethod
def from_uirt(cls, data, seed=None):
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
quadruplet data.
Parameters
@@ -564,7 +558,7 @@ def user_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of user indices (array of 'int')
"""
user_indices = np.fromiter(self.user_indices, dtype='int')
user_indices = np.fromiter(self.user_indices, dtype="int")
for batch_ids in self.idx_iter(len(user_indices), batch_size, shuffle):
yield user_indices[batch_ids]

@@ -582,17 +576,17 @@ def item_iter(self, batch_size=1, shuffle=False):
-------
iterator : batch of item indices (array of 'int')
"""
item_indices = np.fromiter(self.item_indices, 'int')
item_indices = np.fromiter(self.item_indices, "int")
for batch_ids in self.idx_iter(len(item_indices), batch_size, shuffle):
yield item_indices[batch_ids]

def is_unk_user(self, user_idx):
"""Return whether or not a user is unknown given the user index"""
return user_idx >= self.num_users
return user_idx >= self.num_users or user_idx < 0

def is_unk_item(self, item_idx):
"""Return whether or not an item is unknown given the item index"""
return item_idx >= self.num_items
return item_idx >= self.num_items or item_idx < 0

def add_modalities(self, **kwargs):
self.user_feature = kwargs.get("user_feature", None)
@@ -605,4 +599,3 @@ def add_modalities(self, **kwargs):
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)
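
Taken together, the dataset.py changes mean that a built Dataset now keeps the global uid_map/iid_map, exposes user_ids/item_ids as cached lists of raw ids rather than dict-key iterators, and treats negative indices as unknown. A minimal sketch of the new behaviour (assuming a standard cornac installation; the toy triplets and printed values are illustrative only):

```python
# Minimal sketch of Dataset behaviour after this commit; toy data only.
from cornac.data import Dataset

# UIR triplets: (raw user id, raw item id, rating)
data = [("u1", "i1", 4.0), ("u1", "i2", 3.0), ("u2", "i1", 5.0)]
train_set = Dataset.from_uir(data, seed=123)

# user_ids / item_ids are now cached lists of raw ids backed by the
# (global) uid_map / iid_map, instead of dict-key iterators.
print(train_set.user_ids)   # e.g. ['u1', 'u2']
print(train_set.item_ids)   # e.g. ['i1', 'i2']

# Unknown-index checks now also reject negative indices.
print(train_set.is_unk_user(-1))                   # True
print(train_set.is_unk_user(train_set.num_users))  # True
print(train_set.is_unk_item(0))                    # False
```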

8 changes: 5 additions & 3 deletions cornac/eval_methods/base_method.py
@@ -85,6 +85,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_mat = test_set.csr_matrix
pd_mat = csr_matrix((r_preds, (u_indices, i_indices)), shape=gt_mat.shape)

test_user_indices = set(u_indices)
for mt in metrics:
if user_based: # averaging over users
user_results.append(
@@ -93,7 +94,7 @@ def rating_eval(model, metrics, test_set, user_based=False, verbose=False):
gt_ratings=gt_mat.getrow(user_idx).data,
pd_ratings=pd_mat.getrow(user_idx).data,
).item()
for user_idx in test_set.user_indices
for user_idx in test_user_indices
}
)
avg_results.append(sum(user_results[-1].values()) / len(user_results[-1]))
@@ -170,8 +171,9 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm(
test_set.user_indices, desc="Ranking", disable=not verbose, miniters=100
test_user_indices, desc="Ranking", disable=not verbose, miniters=100
):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
if len(test_pos_items) == 0:
Expand All @@ -196,7 +198,7 @@ def pos_items(csr_row):
if exclude_unknowns:
u_gt_pos_mask = u_gt_pos_mask[: train_set.num_items]
u_gt_neg_mask = u_gt_neg_mask[: train_set.num_items]

item_indices = np.nonzero(u_gt_pos_mask + u_gt_neg_mask)[0]
u_gt_pos_items = np.nonzero(u_gt_pos_mask)[0]
u_gt_neg_items = np.nonzero(u_gt_neg_mask)[0]
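
The rating_eval and ranking_eval changes above stop relying on the removed user_indices property and instead iterate over the set of user indices that actually occur in the test interactions. A small self-contained sketch of that per-user averaging pattern (scipy/numpy only; the toy matrices stand in for test_set.csr_matrix and the model predictions):

```python
import numpy as np
from scipy.sparse import csr_matrix

# Toy test interactions: (user index, item index, rating)
u_indices = np.array([0, 0, 2])
i_indices = np.array([1, 3, 2])
gt_ratings = np.array([4.0, 3.0, 5.0])  # ground truth
r_preds = np.array([3.5, 2.5, 4.0])     # model predictions

gt_mat = csr_matrix((gt_ratings, (u_indices, i_indices)), shape=(3, 4))
pd_mat = csr_matrix((r_preds, (u_indices, i_indices)), shape=gt_mat.shape)

# Iterate only over users that appear in the test interactions,
# rather than over a user_indices property of the dataset.
test_user_indices = set(u_indices)
mae_per_user = {
    int(u): float(np.mean(np.abs(gt_mat.getrow(u).data - pd_mat.getrow(u).data)))
    for u in test_user_indices
}
print(mae_per_user)  # {0: 0.5, 2: 1.0}
```

Both matrices are built from the same (user, item) pattern, so the per-row .data arrays line up element-wise, which is what makes the subtraction meaningful.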
39 changes: 19 additions & 20 deletions cornac/eval_methods/propensity_stratified_evaluation.py
@@ -25,38 +25,38 @@ def ranking_eval(
props=None,
):
"""Evaluate model on provided ranking metrics.
Parameters
----------
model: :obj:`cornac.models.Recommender`, required
Recommender model to be evaluated.
metrics: :obj:`iterable`, required
List of rating metrics :obj:`cornac.metrics.RankingMetric`.
train_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for model training. This will be used to exclude
observations already appeared during training.
test_set: :obj:`cornac.data.Dataset`, required
Dataset to be used for evaluation.
val_set: :obj:`cornac.data.Dataset`, optional, default: None
Dataset to be used for model selection. This will be used to exclude
observations already appeared during validation.
rating_threshold: float, optional, default: 1.0
The threshold to convert ratings into positive or negative feedback.
exclude_unknowns: bool, optional, default: True
Ignore unknown users and items during evaluation.
verbose: bool, optional, default: False
Output evaluation progress.
props: dictionary, optional, default: None
items propensity scores
Returns
-------
res: (List, List)
@@ -82,12 +82,13 @@ def pos_items(csr_row):
if rating >= rating_threshold
]

for user_idx in tqdm.tqdm(test_set.user_indices, disable=not verbose, miniters=100):
test_user_indices = set(test_set.uir_tuple[0])
for user_idx in tqdm.tqdm(test_user_indices, disable=not verbose, miniters=100):
test_pos_items = pos_items(gt_mat.getrow(user_idx))
if len(test_pos_items) == 0:
continue

u_gt_pos = np.zeros(test_set.num_items, dtype='float')
u_gt_pos = np.zeros(test_set.num_items, dtype="float")
u_gt_pos[test_pos_items] = 1

val_pos_items = [] if val_mat is None else pos_items(val_mat.getrow(user_idx))
Expand All @@ -97,7 +98,7 @@ def pos_items(csr_row):
else pos_items(train_mat.getrow(user_idx))
)

u_gt_neg = np.ones(test_set.num_items, dtype='int')
u_gt_neg = np.ones(test_set.num_items, dtype="int")
u_gt_neg[test_pos_items + val_pos_items + train_pos_items] = 0

item_indices = None if exclude_unknowns else np.arange(test_set.num_items)
@@ -256,7 +257,7 @@ def _estimate_propensities(self):
item_freq[i] += 1

# fit the exponential param
data = np.array([e for e in item_freq.values()], dtype='float')
data = np.array([e for e in item_freq.values()], dtype="float")
results = powerlaw.Fit(data, discrete=True, fit_method="Likelihood")
alpha = results.power_law.alpha
fmin = results.power_law.xmin
@@ -276,9 +277,7 @@ def _build_stratified_dataset(self, test_data):
self.stratified_sets = {}

# match the corresponding propensity score for each feedback
test_props = np.array(
[self.props[i] for u, i, r in test_data], dtype='float'
)
test_props = np.array([self.props[i] for u, i, r in test_data], dtype="float")

# stratify
minp = min(test_props) - 0.01 * min(test_props)
@@ -338,11 +337,11 @@ def evaluate(self, model, metrics, user_based, show_validation=True):
metrics: :obj:`iterable`
List of metrics.
user_based: bool, required
Evaluation strategy for the rating metrics. Whether results
are averaging based on number of users or number of ratings.
show_validation: bool, optional, default: True
Whether to show the results on validation set (if exists).
Returns
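
The _estimate_propensities hunk above fits a discrete power law to the item popularity counts using the powerlaw package. A short sketch of that step in isolation (assumes the powerlaw package is installed; the toy item indices are illustrative only):

```python
from collections import Counter

import numpy as np
import powerlaw

# Toy item indices, standing in for train_set.uir_tuple[1]
item_indices = np.array([0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4])
item_freq = Counter(item_indices.tolist())

# Fit the exponent of a discrete power law to the popularity counts,
# mirroring _estimate_propensities above.
data = np.array(list(item_freq.values()), dtype="float")
results = powerlaw.Fit(data, discrete=True, fit_method="Likelihood")
alpha = results.power_law.alpha  # power-law exponent
fmin = results.power_law.xmin    # minimum frequency used by the fit
print(alpha, fmin)
```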