diff --git a/README.md b/README.md index 3b4787299..4eb90b9b4 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,7 @@ The recommender models supported by Cornac are listed below. Why don't you join | | [Bayesian Personalized Ranking (BPR)](cornac/models/bpr), [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) | N/A | [bpr_netflix.py](examples/bpr_netflix.py) | | [Factorization Machines (FM)](cornac/models/fm), [paper](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf) | Linux only | [fm_example.py](examples/fm_example.py) | | [Global Average (GlobalAvg)](cornac/models/global_avg), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py) +| | [Global Personalized Top Frequent (GPTop)](cornac/models/gp_top), [paper](https://dl.acm.org/doi/pdf/10.1145/3587153) | N/A | [gp_top_tafeng.py](examples/gp_top_tafeng.py) | | [Item K-Nearest-Neighbors (ItemKNN)](cornac/models/knn), [paper](https://dl.acm.org/doi/pdf/10.1145/371920.372071) | N/A | [knn_movielens.py](examples/knn_movielens.py) | | [Matrix Factorization (MF)](cornac/models/mf), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py), [given_data.py](examples/given_data.py) | | [Maximum Margin Matrix Factorization (MMMF)](cornac/models/mmmf), [paper](https://link.springer.com/content/pdf/10.1007/s10994-008-5073-7.pdf) | N/A | [mmmf_exp.py](examples/mmmf_exp.py) diff --git a/cornac/data/__init__.py b/cornac/data/__init__.py index 4310d4661..718cd23e0 100644 --- a/cornac/data/__init__.py +++ b/cornac/data/__init__.py @@ -21,6 +21,7 @@ from .sentiment import SentimentModality from .reader import Reader from .dataset import Dataset +from .dataset import BasketDataset __all__ = ['FeatureModality', 'TextModality', @@ -28,5 +29,6 @@ 'ImageModality', 'GraphModality', 'SentimentModality', + 'BasketDataset', 'Dataset', 'Reader'] diff --git a/cornac/data/dataset.py b/cornac/data/dataset.py index 5021b0335..c6d74838a 100644 --- a/cornac/data/dataset.py +++ b/cornac/data/dataset.py @@ -13,16 +13,13 @@ # limitations under the License. # ============================================================================ -from collections import OrderedDict, defaultdict -import itertools import warnings +from collections import Counter, OrderedDict, defaultdict import numpy as np -from scipy.sparse import csr_matrix, csc_matrix, dok_matrix +from scipy.sparse import csc_matrix, csr_matrix, dok_matrix -from ..utils import get_rng -from ..utils import validate_format -from ..utils import estimate_batches +from ..utils import estimate_batches, get_rng, validate_format class Dataset(object): @@ -565,3 +562,354 @@ def add_modalities(self, **kwargs): self.item_graph = kwargs.get("item_graph", None) self.sentiment = kwargs.get("sentiment", None) self.review_text = kwargs.get("review_text", None) + + +class BasketDataset(Dataset): + """Training set contains history baskets + + Parameters + ---------- + num_users: int, required + Number of users. + + num_items: int, required + Number of items. + + uid_map: :obj:`OrderDict`, required + The dictionary containing mapping from user original ids to mapped integer indices. + + iid_map: :obj:`OrderDict`, required + The dictionary containing mapping from item original ids to mapped integer indices. + + uir_tuple: tuple, required + Tuple of 3 numpy arrays (user_indices, item_indices, rating_values). 
+
+    basket_ids: numpy.array, required
+        Array of basket indices corresponding to observations in `uir_tuple`.
+
+    timestamps: numpy.array, optional, default: None
+        Numpy array of timestamps corresponding to feedback in `uir_tuple`.
+        This is only available when input data is in `UBIT` and `UBITJson` formats.
+
+    extra_data: numpy.array, optional, default: None
+        Array of JSON objects corresponding to observations in `uir_tuple`.
+
+    seed: int, optional, default: None
+        Random seed for reproducing data sampling.
+
+    Attributes
+    ----------
+    basket_ids: numpy.array
+        Array of basket indices corresponding to observations in `uir_tuple`.
+
+    timestamps: numpy.array
+        Numpy array of timestamps corresponding to feedback in `uir_tuple`.
+        This is only available when input data is in `UBIT` and `UBITJson` formats.
+    """
+
+    def __init__(
+        self,
+        num_users,
+        num_baskets,
+        num_items,
+        uid_map,
+        bid_map,
+        iid_map,
+        uir_tuple,
+        basket_ids=None,
+        timestamps=None,
+        extra_data=None,
+        seed=None,
+    ):
+        super().__init__(
+            num_users=num_users,
+            num_items=num_items,
+            uid_map=uid_map,
+            iid_map=iid_map,
+            uir_tuple=uir_tuple,
+            timestamps=timestamps,
+            seed=seed,
+        )
+        self.num_baskets = num_baskets
+        self.bid_map = bid_map
+        self.basket_ids = basket_ids
+        self.extra_data = extra_data
+        basket_sizes = list(Counter(basket_ids).values())
+        self.max_basket_size = np.max(basket_sizes)
+        self.min_basket_size = np.min(basket_sizes)
+        self.avg_basket_size = np.mean(basket_sizes)
+
+        self.__baskets = None
+        self.__user_basket_data = None
+        self.__chrono_user_basket_data = None
+
+    @property
+    def baskets(self):
+        """A dictionary storing, for each basket ID, the indices where it appears in the data."""
+        if self.__baskets is None:
+            self.__baskets = OrderedDict()
+            for idx, bid in enumerate(self.basket_ids):
+                self.__baskets.setdefault(bid, [])
+                self.__baskets[bid].append(idx)
+        return self.__baskets
+
+    @property
+    def user_basket_data(self):
+        """Data organized by user. A dictionary where keys are users,
+        values are lists of baskets purchased by the corresponding users.
+        """
+        if self.__user_basket_data is None:
+            self.__user_basket_data = defaultdict()
+            for bid, ids in self.baskets.items():
+                u = self.uir_tuple[0][ids[0]]
+                self.__user_basket_data.setdefault(u, [])
+                self.__user_basket_data[u].append(bid)
+        return self.__user_basket_data
+
+    @property
+    def chrono_user_basket_data(self):
+        """Data organized by user, sorted chronologically (timestamps required).
+        A dictionary where keys are users, values are tuples of two chronologically
+        sorted lists (baskets, timestamps) of the corresponding users.
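+
+        For example (illustrative values only), a user with basket ids 0 and 1
+        bought at timestamps 20 and 10 respectively would be stored as::
+
+            {user_idx: ([1, 0], [10, 20])}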
+ """ + if self.__chrono_user_basket_data is None: + assert self.timestamps is not None # we need timestamps + + basket_timestamps = [ + self.timestamps[ids[0]] for ids in self.baskets.values() + ] # one-off + + self.__chrono_user_basket_data = defaultdict(lambda: ([], [])) + for (bid, ids), t in zip(self.baskets.items(), basket_timestamps): + u = self.uir_tuple[0][ids[0]] + self.__chrono_user_basket_data[u][0].append(bid) + self.__chrono_user_basket_data[u][1].append(t) + + # sorting based on timestamps + for user, (baskets, timestamps) in self.__chrono_user_basket_data.items(): + sorted_idx = np.argsort(timestamps) + sorted_baskets = [baskets[i] for i in sorted_idx] + sorted_timestamps = [timestamps[i] for i in sorted_idx] + self.__chrono_user_basket_data[user] = ( + sorted_baskets, + sorted_timestamps, + ) + + return self.__chrono_user_basket_data + + @classmethod + def build( + cls, + data, + fmt="UBI", + global_uid_map=None, + global_bid_map=None, + global_iid_map=None, + seed=None, + exclude_unknowns=False, + ): + """Constructing Dataset from given data of specific format. + + Parameters + ---------- + data: list, required + Data in the form of tuple (user, basket) for UB format, + or tuple (user, timestamps, basket) for UTB format. + + fmt: str, default: 'UBI' + Format of the input data. Currently, we are supporting: + + 'UBI': User, Basket_ID, Item + 'UBIT': User, Basket_ID, Item, Timestamp + 'UBITJson': User, Basket_ID, Item, Timestamp, Extra data in Json format + + global_uid_map: :obj:`defaultdict`, optional, default: None + The dictionary containing global mapping from original ids to mapped ids of users. + + global_bid_map: :obj:`defaultdict`, optional, default: None + The dictionary containing global mapping from original ids to mapped ids of baskets. + + global_iid_map: :obj:`defaultdict`, optional, default: None + The dictionary containing global mapping from original ids to mapped ids of items. + + seed: int, optional, default: None + Random seed for reproducing data sampling. + + exclude_unknowns: bool, default: False + Ignore unknown users and items. + + Returns + ------- + res: :obj:`` + BasketDataset object. 
+
+        """
+        fmt = validate_format(fmt, ["UBI", "UBIT", "UBITJson"])
+
+        if global_uid_map is None:
+            global_uid_map = OrderedDict()
+        if global_bid_map is None:
+            global_bid_map = OrderedDict()
+        if global_iid_map is None:
+            global_iid_map = OrderedDict()
+
+        u_indices = []
+        b_indices = []
+        i_indices = []
+        valid_idx = []
+        for idx, (uid, bid, iid, *_) in enumerate(data):
+            if exclude_unknowns and (iid not in global_iid_map):
+                continue
+
+            global_uid_map.setdefault(uid, len(global_uid_map))
+            global_bid_map.setdefault(bid, len(global_bid_map))
+            global_iid_map.setdefault(iid, len(global_iid_map))
+
+            u_indices.append(global_uid_map[uid])
+            b_indices.append(global_bid_map[bid])
+            i_indices.append(global_iid_map[iid])
+            valid_idx.append(idx)
+
+        uir_tuple = (
+            np.asarray(u_indices, dtype="int"),
+            np.asarray(i_indices, dtype="int"),
+            np.ones(len(u_indices), dtype="float"),
+        )
+
+        basket_ids = np.asarray(b_indices, dtype="int")
+
+        timestamps = (
+            np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
+            if fmt in ["UBIT", "UBITJson"]
+            else None
+        )
+
+        extra_data = [data[i][4] for i in valid_idx] if fmt == "UBITJson" else None
+
+        dataset = cls(
+            num_users=len(global_uid_map),
+            num_baskets=len(global_bid_map),
+            num_items=len(global_iid_map),
+            uid_map=global_uid_map,
+            bid_map=global_bid_map,
+            iid_map=global_iid_map,
+            uir_tuple=uir_tuple,
+            basket_ids=basket_ids,
+            timestamps=timestamps,
+            extra_data=extra_data,
+            seed=seed,
+        )
+
+        return dataset
+
+    @classmethod
+    def from_ubi(cls, data, seed=None):
+        """Constructing BasketDataset from UBI (User, Basket, Item) triples data.
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of tuples (user, basket, item).
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBI", seed=seed)
+
+    @classmethod
+    def from_ubit(cls, data, seed=None):
+        """Constructing BasketDataset from UBIT format (User, Basket, Item, Timestamp).
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of quadruples (user, basket, item, timestamp).
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBIT", seed=seed)
+
+    @classmethod
+    def from_ubitjson(cls, data, seed=None):
+        """Constructing BasketDataset from UBITJson format (User, Basket, Item, Timestamp, Json).
+
+        Parameters
+        ----------
+        data: list
+            Data in the form of tuples (user, basket, item, timestamp, json).
+
+        seed: int, optional, default: None
+            Random seed for reproducing data sampling.
+
+        Returns
+        -------
+        res: :obj:`<cornac.data.BasketDataset>`
+            BasketDataset object.
+
+        """
+        return cls.build(data, fmt="UBITJson", seed=seed)
+
+    def num_batches(self, batch_size):
+        """Estimate number of batches per epoch"""
+        return estimate_batches(len(self.user_data), batch_size)
+
+    def user_basket_data_iter(self, batch_size=1, shuffle=False):
+        """Create an iterator over data yielding batch of user indices and batch of baskets per user
+
+        Parameters
+        ----------
+        batch_size: int, optional, default = 1
+
+        shuffle: bool, optional, default: False
+            If `True`, the order of users will be randomized. If `False`, the default order is kept.
+
+        Returns
+        -------
+        iterator : batch of user indices, batch of user data corresponding to user indices
+
+        """
+        user_indices = np.asarray(list(self.user_basket_data.keys()), dtype="int")
+        for batch_ids in self.idx_iter(
+            len(self.user_basket_data), batch_size=batch_size, shuffle=shuffle
+        ):
+            batch_users = user_indices[batch_ids]
+            batch_basket_ids = np.asarray(
+                [self.user_basket_data[uid] for uid in batch_users], dtype="int"
+            )
+            yield batch_users, batch_basket_ids
+
+    def basket_iter(self, batch_size=1, shuffle=False):
+        """Create an iterator over data yielding batch of basket indices and batch of baskets
+
+        Parameters
+        ----------
+        batch_size: int, optional, default = 1
+
+        shuffle: bool, optional, default: False
+            If `True`, the order of baskets will be randomized. If `False`, the default order is kept.
+
+        Returns
+        -------
+        iterator : batch of basket indices, batch of baskets (list of list)
+
+        """
+        basket_indices = np.array(list(self.baskets.keys()))
+        baskets = list(self.baskets.values())
+        for batch_ids in self.idx_iter(len(basket_indices), batch_size, shuffle):
+            batch_basket_indices = basket_indices[batch_ids]
+            batch_baskets = [baskets[idx] for idx in batch_ids]
+            yield batch_basket_indices, batch_baskets
diff --git a/cornac/data/reader.py b/cornac/data/reader.py
index 9727ca1d3..060257415 100644
--- a/cornac/data/reader.py
+++ b/cornac/data/reader.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 
+import ast
 import itertools
 from collections import Counter
 
@@ -46,7 +47,30 @@ def tup_parser(tokens, **kwargs):
     ]
 
 
-PARSERS = {"UI": ui_parser, "UIR": uir_parser, "UIRT": uirt_parser, "UITup": tup_parser, "UIReview": review_parser}
+def ubi_parser(tokens, **kwargs):
+    return [(tokens[0], tokens[1], tokens[2])]
+
+
+def ubit_parser(tokens, **kwargs):
+    return [(tokens[0], tokens[1], tokens[2], int(tokens[3]))]
+
+
+def ubitjson_parser(tokens, **kwargs):
+    return [
+        (tokens[0], tokens[1], tokens[2], int(tokens[3]), ast.literal_eval(tokens[4]))
+    ]
+
+
+PARSERS = {
+    "UI": ui_parser,
+    "UIR": uir_parser,
+    "UIRT": uirt_parser,
+    "UITup": tup_parser,
+    "UIReview": review_parser,
+    "UBI": ubi_parser,
+    "UBIT": ubit_parser,
+    "UBITJson": ubitjson_parser,
+}
 
 
 class Reader:
@@ -70,6 +94,18 @@ class Reader:
         The minimum frequency of an item to be retained.
         If `min_item_freq = 1`, all items will be included.
 
+    min_basket_size: int, default = 1
+        The minimum number of items of a basket to be retained.
+        If `min_basket_size = 1`, all baskets will be included.
+
+    max_basket_size: int, default = -1
+        The maximum number of items of a basket to be retained.
+        If `max_basket_size = -1`, all baskets will be included.
+
+    min_basket_sequence: int, default = 1
+        The minimum number of baskets of a user to be retained.
+        If `min_basket_sequence = 1`, all users will be included.
+
     bin_threshold: float, default = None
         The rating threshold to binarize rating values (turn explicit feedback to implicit feedback).
For example, if `bin_threshold = 3.0`, all rating values >= 3.0 will be set to 1.0, @@ -90,6 +126,9 @@ def __init__( item_set=None, min_user_freq=1, min_item_freq=1, + min_basket_size=1, + max_basket_size=-1, + min_basket_sequence=1, bin_threshold=None, encoding="utf-8", errors=None, @@ -106,6 +145,9 @@ def __init__( ) self.min_uf = min_user_freq self.min_if = min_item_freq + self.min_basket_size = min_basket_size + self.max_basket_size = max_basket_size + self.min_basket_sequence = min_basket_sequence self.bin_threshold = bin_threshold self.encoding = encoding self.errors = errors @@ -134,6 +176,18 @@ def binarize(t): item_freq = Counter(t[1] for t in tuples) tuples = [t for t in tuples if item_freq[t[1]] >= self.min_if] + if self.min_basket_size > 1: + basket_size = Counter(t[1] for t in tuples) + tuples = [t for t in tuples if basket_size[t[1]] >= self.min_basket_size] + + if self.max_basket_size > 1: + basket_size = Counter(t[1] for t in tuples) + tuples = [t for t in tuples if basket_size[t[1]] <= self.max_basket_size] + + if self.min_basket_sequence > 1: + basket_sequence = Counter(u for (u, _) in set((t[0], t[1]) for t in tuples)) + tuples = [t for t in tuples if basket_sequence[t[0]] >= self.min_basket_sequence] + return tuples def read( @@ -154,7 +208,7 @@ def read( Path to the data file. fmt: str, default: 'UIR' - Line format to be parsed ('UIR' or 'UIRT'). + Line format to be parsed ('UI', 'UIR', 'UIRT', 'UITup', 'UIReview', 'UBI', 'UBIT', or 'UBITJson') sep: str, default: '\t' The delimiter string. @@ -218,7 +272,7 @@ def read_text(fpath, sep=None, encoding="utf-8", errors=None): Optional string that specifies how encoding errors are to be handled. Pass 'strict' to raise a ValueError exception if there is an encoding error (None has the same effect), or pass 'ignore' to ignore errors. - + Returns ------- texts, ids (optional): list, list diff --git a/cornac/datasets/README.md b/cornac/datasets/README.md index db3c040e9..ea3b4f5a2 100644 --- a/cornac/datasets/README.md +++ b/cornac/datasets/README.md @@ -8,7 +8,7 @@ For easy experimentation, Cornac offers access to a number of popular recommenda **How to cite.** If you are using one of the datasets listed below in your research, please follow the citation guidelines by the authors (the "source" link below) of each respective dataset. - + @@ -195,3 +195,29 @@ Samples from ratings: [('1', '1', 2.0), ('1', '2', 4.0), ('1', '3', 3.5)] Samples from trust: [('2', '966', 1.0), ('2', '104', 1.0), ('5', '1509', 1.0)] ``` Our dataset is now ready to use for model training and evaluation. A concrete example is [sorec_filmtrust](../../examples/sorec_filmtrust.py), which illustrates how to perform an experiment with the [SoRec](../models/sorec/) model on FilmTrust. More details regarding the other datasets are available in the [documentation](https://cornac.readthedocs.io/en/latest/datasets.html). + +--- + +## Next-Basket Datasets + +
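+Basket data can be loaded with `cornac.data.Reader` using one of the basket formats (`UBI`, `UBIT`, or `UBITJson`). A minimal loading sketch:
+
+```python
+from cornac.datasets import tafeng
+
+# list of (user, basket, item, timestamp, json) tuples
+data = tafeng.load_basket()
+```
+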

+<table>
+  <tr>
+    <th rowspan="2">Dataset</th>
+    <th colspan="4">Preference Info.</th>
+    <th rowspan="2">Extra Info.</th>
+  </tr>
+  <tr>
+    <th>#Users</th>
+    <th>#Items</th>
+    <th>#Baskets</th>
+    <th>#Interactions</th>
+  </tr>
+  <tr>
+    <td>Ta Feng<br><a href="https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset">(source)</a></td>
+    <td>28,297</td>
+    <td>22,542</td>
+    <td>86,403</td>
+    <td>817,741</td>
+    <td>price, quantity</td>
+  </tr>
+</table>
\ No newline at end of file diff --git a/cornac/datasets/__init__.py b/cornac/datasets/__init__.py index ba7f19a23..5d406171b 100644 --- a/cornac/datasets/__init__.py +++ b/cornac/datasets/__init__.py @@ -21,4 +21,5 @@ from . import filmtrust from . import movielens from . import netflix +from . import tafeng from . import tradesy \ No newline at end of file diff --git a/cornac/datasets/tafeng.py b/cornac/datasets/tafeng.py new file mode 100644 index 000000000..ac3c5faf6 --- /dev/null +++ b/cornac/datasets/tafeng.py @@ -0,0 +1,45 @@ +# Copyright 2023 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +This data is built based on the Ta Feng Grocery Dataset that contains +a Chinese grocery store transaction data from November 2000 to February 2001. +Accessed at https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset +""" + +from ..utils import cache +from ..data import Reader +from typing import List + + +def load_basket(fmt="UBITJson", reader: Reader = None) -> List: + """Load the transaction data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data in the form of a list of tuples (user, basket, item, timestamp, json). + """ + fpath = cache( + url="https://static.preferred.ai/hieudo/basket.zip", + unzip=True, + relative_path="tafeng/basket.txt", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep="\t") diff --git a/cornac/eval_methods/__init__.py b/cornac/eval_methods/__init__.py index 5a542daed..bd0d3dfbc 100644 --- a/cornac/eval_methods/__init__.py +++ b/cornac/eval_methods/__init__.py @@ -20,10 +20,12 @@ from .ratio_split import RatioSplit from .stratified_split import StratifiedSplit from .cross_validation import CrossValidation +from .next_basket_evaluation import NextBasketEvaluation from .propensity_stratified_evaluation import PropensityStratifiedEvaluation __all__ = ['BaseMethod', 'RatioSplit', 'StratifiedSplit', 'CrossValidation', + 'NextBasketEvaluation', 'PropensityStratifiedEvaluation'] \ No newline at end of file diff --git a/cornac/eval_methods/base_method.py b/cornac/eval_methods/base_method.py index e6cd47e47..e975ea5b0 100644 --- a/cornac/eval_methods/base_method.py +++ b/cornac/eval_methods/base_method.py @@ -230,6 +230,12 @@ class BaseMethod: data: array-like, required Raw preference data in the triplet format [(user_id, item_id, rating_value)]. + fmt: str, default: 'UIR' + Format of the input data. Currently, we are supporting: + + 'UIR': User, Item, Rating + 'UIRT': User, Item, Rating, Timestamp + rating_threshold: float, optional, default: 1.0 Threshold used to binarize rating values into positive or negative feedback for model evaluation using ranking metrics (rating metrics are not affected). 
@@ -255,7 +261,7 @@ def __init__(
         verbose=False,
         **kwargs
     ):
-        self._data = data
+        self.data = data
         self.fmt = fmt
         self.train_set = None
         self.test_set = None
diff --git a/cornac/eval_methods/cross_validation.py b/cornac/eval_methods/cross_validation.py
index b3fbf46f4..dc246ea66 100644
--- a/cornac/eval_methods/cross_validation.py
+++ b/cornac/eval_methods/cross_validation.py
@@ -73,7 +73,7 @@ def __init__(
         )
 
         self.n_folds = n_folds
-        self.n_ratings = len(self._data)
+        self.n_ratings = len(self.data)
         self.current_fold = 0
         self.current_split = None
 
@@ -116,8 +116,8 @@ def _get_train_test(self):
         test_idx = np.where(self._partition == self.current_fold)[0]
         train_idx = np.where(self._partition != self.current_fold)[0]
 
-        train_data = safe_indexing(self._data, train_idx)
-        test_data = safe_indexing(self._data, test_idx)
+        train_data = safe_indexing(self.data, train_idx)
+        test_data = safe_indexing(self.data, test_idx)
         self.build(train_data=train_data, test_data=test_data, val_data=test_data)
 
     def _next_fold(self):
@@ -141,4 +141,3 @@ def evaluate(self, model, metrics, user_based, show_validation):
 
         result.organize()
         return result, None  # no validation result of CV
-
diff --git a/cornac/eval_methods/next_basket_evaluation.py b/cornac/eval_methods/next_basket_evaluation.py
new file mode 100644
index 000000000..bd6286c6c
--- /dev/null
+++ b/cornac/eval_methods/next_basket_evaluation.py
@@ -0,0 +1,404 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+from collections import OrderedDict
+
+import numpy as np
+from tqdm.auto import tqdm
+
+from . import RatioSplit
+from ..data import BasketDataset
+from ..experiment.result import Result
+from ..utils.common import safe_indexing
+
+
+def ranking_eval(
+    model,
+    metrics,
+    train_set,
+    test_set,
+    repetition_eval=False,
+    exploration_eval=False,
+    exclude_unknowns=True,
+    verbose=False,
+):
+    """Evaluate model on provided ranking metrics.
+
+    Parameters
+    ----------
+    model: :obj:`cornac.models.NextBasketRecommender`, required
+        NextBasketRecommender model to be evaluated.
+
+    metrics: :obj:`iterable`, required
+        List of ranking metrics :obj:`cornac.metrics.RankingMetric`.
+
+    train_set: :obj:`cornac.data.BasketDataset`, required
+        BasketDataset to be used for model training. This is used to exclude
+        observations that already appeared during training.
+
+    test_set: :obj:`cornac.data.BasketDataset`, required
+        BasketDataset to be used for evaluation.
+
+    repetition_eval: boolean, optional, default: False
+        Also evaluate on repetition items, i.e., items that appeared in history baskets.
+
+    exploration_eval: boolean, optional, default: False
+        Also evaluate on exploration items, i.e., items that did not appear in history baskets.
+
+    exclude_unknowns: bool, optional, default: True
+        Ignore unknown users and items during evaluation.
+
+    verbose: bool, optional, default: False
+        Output evaluation progress.
+ + Returns + ------- + res: (List, List) + Tuple of two lists: + - average result for each of the metrics + - average result per user for each of the metrics + + """ + + if len(metrics) == 0: + return [], [] + + avg_results = { + "conventional": [], + "repetition": [], + "exploration": [], + } + user_results = { + "conventional": [{} for _ in enumerate(metrics)], + "repetition": [{} for _ in enumerate(metrics)], + "exploration": [{} for _ in enumerate(metrics)], + } + + def pos_items(baskets): + return [item_idx for basket in baskets for item_idx in basket] + + def get_gt_items(train_set, test_set, test_pos_items, exclude_unknowns): + # binary mask for ground-truth positive items + u_gt_pos_mask = np.zeros(test_set.num_items, dtype="int") + u_gt_pos_mask[test_pos_items] = 1 + + # binary mask for ground-truth negative items, removing all positive items + u_gt_neg_mask = np.ones(test_set.num_items, dtype="int") + u_gt_neg_mask[test_pos_items] = 0 + + # filter items being considered for evaluation + if exclude_unknowns: + u_gt_pos_mask = u_gt_pos_mask[: train_set.num_items] + u_gt_neg_mask = u_gt_neg_mask[: train_set.num_items] + + u_gt_pos_items = np.nonzero(u_gt_pos_mask)[0] + u_gt_neg_items = np.nonzero(u_gt_neg_mask)[0] + item_indices = np.nonzero(u_gt_pos_mask + u_gt_neg_mask)[0] + return item_indices, u_gt_pos_items, u_gt_neg_items + + (test_user_indices, test_item_indices, _) = test_set.uir_tuple + for user_idx in tqdm( + set(test_user_indices), desc="Ranking", disable=not verbose, miniters=100 + ): + [*history_bids, gt_bid] = test_set.user_basket_data[user_idx] + test_pos_items = pos_items( + [[test_item_indices[idx] for idx in test_set.baskets[gt_bid]]] + ) + if len(test_pos_items) == 0: + continue + + item_indices, u_gt_pos_items, u_gt_neg_items = get_gt_items( + train_set, test_set, test_pos_items, exclude_unknowns + ) + + item_rank, item_scores = model.rank( + user_idx, + item_indices, + history_baskets=[ + [test_item_indices[idx] for idx in test_set.baskets[bid]] + for bid in history_bids + ], + baskets=test_set.baskets, + basket_ids=test_set.basket_ids, + extra_data=test_set.extra_data, + ) + + for i, mt in enumerate(metrics): + mt_score = mt.compute( + gt_pos=u_gt_pos_items, + gt_neg=u_gt_neg_items, + pd_rank=item_rank, + pd_scores=item_scores, + item_indices=item_indices, + ) + user_results["conventional"][i][user_idx] = mt_score + + history_items = set( + test_item_indices[idx] + for bid in history_bids + for idx in test_set.baskets[bid] + ) + if repetition_eval: + test_repetition_pos_items = pos_items( + [ + [ + test_item_indices[idx] + for idx in test_set.baskets[gt_bid] + if test_item_indices[idx] in history_items + ] + ] + ) + if len(test_repetition_pos_items) > 0: + _, u_gt_pos_items, u_gt_neg_items = get_gt_items( + train_set, test_set, test_repetition_pos_items, exclude_unknowns + ) + for i, mt in enumerate(metrics): + mt_score = mt.compute( + gt_pos=u_gt_pos_items, + gt_neg=u_gt_neg_items, + pd_rank=item_rank, + pd_scores=item_scores, + item_indices=item_indices, + ) + user_results["repetition"][i][user_idx] = mt_score + + if exploration_eval: + test_exploration_pos_items = pos_items( + [ + [ + test_item_indices[idx] + for idx in test_set.baskets[gt_bid] + if test_item_indices[idx] not in history_items + ] + ] + ) + if len(test_exploration_pos_items) > 0: + _, u_gt_pos_items, u_gt_neg_items = get_gt_items( + train_set, test_set, test_exploration_pos_items, exclude_unknowns + ) + for i, mt in enumerate(metrics): + mt_score = mt.compute( + gt_pos=u_gt_pos_items, + 
+                        gt_neg=u_gt_neg_items,
+                        pd_rank=item_rank,
+                        pd_scores=item_scores,
+                        item_indices=item_indices,
+                    )
+                    user_results["exploration"][i][user_idx] = mt_score
+
+    # avg results of ranking metrics
+    for i, mt in enumerate(metrics):
+        avg_results["conventional"].append(
+            sum(user_results["conventional"][i].values())
+            / len(user_results["conventional"][i])
+        )
+        if repetition_eval:
+            avg_results["repetition"].append(
+                sum(user_results["repetition"][i].values())
+                / len(user_results["repetition"][i])
+            )
+        if exploration_eval:
+            avg_results["exploration"].append(
+                sum(user_results["exploration"][i].values())
+                / len(user_results["exploration"][i])
+            )
+
+    return avg_results, user_results
+
+
+class NextBasketEvaluation(RatioSplit):
+    """Next Basket Recommendation Evaluation method
+
+    Parameters
+    ----------
+    data: list, required
+        Raw preference data in the format specified by `fmt`, e.g.,
+        tuples of (user_id, basket_id, item_id) for `UBI`.
+
+    test_size: float, optional, default: 0.2
+        The proportion of the test set, \
+        if > 1 then it is treated as the size of the test set.
+
+    val_size: float, optional, default: 0.0
+        The proportion of the validation set, \
+        if > 1 then it is treated as the size of the validation set.
+
+    fmt: str, default: 'UBI'
+        Format of the input data. Currently, we are supporting:
+
+        'UBI': User, Basket, Item
+        'UBIT': User, Basket, Item, Timestamp
+        'UBITJson': User, Basket, Item, Timestamp, Json
+
+    seed: int, optional, default: None
+        Random seed for reproducibility.
+
+    repetition_eval: bool, optional, default: False
+        Also report metrics on repetition items, i.e., items that appeared
+        in history baskets.
+
+    exploration_eval: bool, optional, default: False
+        Also report metrics on exploration items, i.e., items that did not
+        appear in history baskets.
+
+    exclude_unknowns: bool, optional, default: True
+        If `True`, unknown items will be ignored during model evaluation.
+
+    verbose: bool, optional, default: False
+        Output running log.
+
+    """
+
+    def __init__(
+        self,
+        data=None,
+        test_size=0.2,
+        val_size=0.0,
+        fmt="UBI",
+        seed=None,
+        repetition_eval=False,
+        exploration_eval=False,
+        exclude_unknowns=True,
+        verbose=False,
+        **kwargs
+    ):
+        assert fmt.startswith("U")
+        data_size = len(set(u for (u, *_) in data))  # number of users
+
+        super().__init__(
+            data=data,
+            data_size=data_size,
+            test_size=test_size,
+            val_size=val_size,
+            fmt=fmt,
+            seed=seed,
+            exclude_unknowns=exclude_unknowns,
+            verbose=verbose,
+            **kwargs
+        )
+        self.repetition_eval = repetition_eval
+        self.exploration_eval = exploration_eval
+
+    def _split(self):
+        user_arr = [u for (u, *_) in self.data]
+        all_users = np.unique(user_arr)
+        self.rng.shuffle(all_users)
+
+        train_users = set(all_users[: self.train_size])
+        test_users = set(all_users[-self.test_size :])
+        val_users = set(all_users[self.train_size : -self.test_size])
+
+        train_idx = [i for i, u in enumerate(user_arr) if u in train_users]
+        test_idx = [i for i, u in enumerate(user_arr) if u in test_users]
+        val_idx = [i for i, u in enumerate(user_arr) if u in val_users]
+
+        train_data = safe_indexing(self.data, train_idx)
+        test_data = safe_indexing(self.data, test_idx)
+        val_data = safe_indexing(self.data, val_idx) if len(val_idx) > 0 else None
+
+        self.build(train_data=train_data, test_data=test_data, val_data=val_data)
+
+    def _build_datasets(self, train_data, test_data, val_data=None):
+        self.train_set = BasketDataset.build(
+            data=train_data,
+            fmt=self.fmt,
+            global_uid_map=self.global_uid_map,
+            global_iid_map=self.global_iid_map,
+            seed=self.seed,
+            exclude_unknowns=False,
+        )
+        if self.verbose:
+            print("---")
+            print("Training data:")
+            print("Number of users = {}".format(self.train_set.num_users))
+            print("Number of items = {}".format(self.train_set.num_items))
+            print("Number of baskets = {}".format(self.train_set.num_baskets))
+
+        self.test_set = BasketDataset.build(
+            data=test_data,
+            fmt=self.fmt,
+            global_uid_map=self.global_uid_map,
+            global_iid_map=self.global_iid_map,
+            seed=self.seed,
+            exclude_unknowns=self.exclude_unknowns,
+        )
+        if self.verbose:
+            print("---")
+            print("Test data:")
+            print("Number of users = {}".format(len(self.test_set.uid_map)))
+            print("Number of items = {}".format(len(self.test_set.iid_map)))
+            print("Number of baskets = {}".format(self.test_set.num_baskets))
+            print(
+                "Number of unknown users = {}".format(
+                    self.test_set.num_users - self.train_set.num_users
+                )
+            )
+            print(
+                "Number of unknown items = {}".format(
+                    self.test_set.num_items - self.train_set.num_items
+                )
+            )
+
+        if val_data is not None and len(val_data) > 0:
+            self.val_set = BasketDataset.build(
+                data=val_data,
+                fmt=self.fmt,
+                global_uid_map=self.global_uid_map,
+                global_iid_map=self.global_iid_map,
+                seed=self.seed,
+                exclude_unknowns=self.exclude_unknowns,
+            )
+            if self.verbose:
+                print("---")
+                print("Validation data:")
+                print("Number of users = {}".format(len(self.val_set.uid_map)))
+                print("Number of items = {}".format(len(self.val_set.iid_map)))
+                print("Number of baskets = {}".format(self.val_set.num_baskets))
+
+        self.total_baskets = 0 if self.val_set is None else self.val_set.num_baskets
+        self.total_baskets += self.test_set.num_baskets + self.train_set.num_baskets
+        if self.verbose:
+            print("---")
+            print("Total users = {}".format(self.total_users))
+            print("Total items = {}".format(self.total_items))
+            print("Total baskets = {}".format(self.total_baskets))
+
+    def _eval(self, model, test_set, **kwargs):
+        metric_avg_results = OrderedDict()
+        metric_user_results = OrderedDict()
+
+        avg_results, user_results = ranking_eval(
+            model=model,
+            metrics=self.ranking_metrics,
+            train_set=self.train_set,
+            test_set=test_set,
+            repetition_eval=self.repetition_eval,
+            exploration_eval=self.exploration_eval,
+            exclude_unknowns=self.exclude_unknowns,
+            verbose=self.verbose,
+        )
+
+        for i, mt in enumerate(self.ranking_metrics):
+            metric_avg_results[mt.name] = avg_results["conventional"][i]
+            metric_user_results[mt.name] = user_results["conventional"][i]
+
+        if self.repetition_eval:
+            for i, mt in enumerate(self.ranking_metrics):
+                metric_avg_results["{}-rep".format(mt.name)] = avg_results[
+                    "repetition"
+                ][i]
+                metric_user_results["{}-rep".format(mt.name)] = user_results[
+                    "repetition"
+                ][i]
+
+        if self.exploration_eval:
+            for i, mt in enumerate(self.ranking_metrics):
+                metric_avg_results["{}-expl".format(mt.name)] = avg_results[
+                    "exploration"
+                ][i]
+                metric_user_results["{}-expl".format(mt.name)] = user_results[
+                    "exploration"
+                ][i]
+        return Result(model.name, metric_avg_results, metric_user_results)
diff --git a/cornac/eval_methods/propensity_stratified_evaluation.py b/cornac/eval_methods/propensity_stratified_evaluation.py
index 08263f8ec..e718efbf1 100644
--- a/cornac/eval_methods/propensity_stratified_evaluation.py
+++ b/cornac/eval_methods/propensity_stratified_evaluation.py
@@ -197,7 +197,7 @@ def __init__(
 
         # split the data into train/valid/test sets
         self.train_size, self.val_size, self.test_size = RatioSplit.validate_size(
-            val_size, test_size, len(self._data)
+            val_size, test_size, len(self.data)
         )
         self._split()
@@ -233,14 +233,14 @@ def _eval(self, model, test_set, val_set, user_based, props=None):
         return Result(model.name, metric_avg_results, metric_user_results)
 
     def _split(self):
-        data_idx = self.rng.permutation(len(self._data))
+        data_idx = self.rng.permutation(len(self.data))
 
         train_idx = data_idx[: self.train_size]
test_idx = data_idx[-self.test_size :] val_idx = data_idx[self.train_size : -self.test_size] - train_data = safe_indexing(self._data, train_idx) - test_data = safe_indexing(self._data, test_idx) - val_data = safe_indexing(self._data, val_idx) if len(val_idx) > 0 else None + train_data = safe_indexing(self.data, train_idx) + test_data = safe_indexing(self.data, test_idx) + val_data = safe_indexing(self.data, val_idx) if len(val_idx) > 0 else None # build train/test/valid datasets self._build_datasets( @@ -253,7 +253,7 @@ def _split(self): def _estimate_propensities(self): # find the item's frequencies item_freq = defaultdict(int) - for u, i, r in self._data: + for u, i, r in self.data: item_freq[i] += 1 # fit the exponential param diff --git a/cornac/eval_methods/ratio_split.py b/cornac/eval_methods/ratio_split.py index fd1202812..269893a2e 100644 --- a/cornac/eval_methods/ratio_split.py +++ b/cornac/eval_methods/ratio_split.py @@ -16,7 +16,6 @@ from math import ceil from .base_method import BaseMethod -from ..utils import get_rng from ..utils.common import safe_indexing @@ -52,54 +51,76 @@ class RatioSplit(BaseMethod): """ - def __init__(self, data, test_size=0.2, val_size=0.0, rating_threshold=1.0, - seed=None, exclude_unknowns=True, verbose=False, **kwargs): - super().__init__(data=data, rating_threshold=rating_threshold, seed=seed, - exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs) - - self.train_size, self.val_size, self.test_size = self.validate_size(val_size, test_size, len(self._data)) + def __init__( + self, + data, + test_size=0.2, + val_size=0.0, + rating_threshold=1.0, + seed=None, + exclude_unknowns=True, + verbose=False, + **kwargs, + ): + super().__init__( + data=data, + rating_threshold=rating_threshold, + seed=seed, + exclude_unknowns=exclude_unknowns, + verbose=verbose, + **kwargs, + ) + + self.train_size, self.val_size, self.test_size = self.validate_size( + val_size=val_size, + test_size=test_size, + data_size=kwargs.get("data_size", len(data)), + ) self._split() @staticmethod - def validate_size(val_size, test_size, num_ratings): + def validate_size(val_size, test_size, data_size): if val_size is None: val_size = 0.0 elif val_size < 0: - raise ValueError('val_size={} should be greater than zero'.format(val_size)) - elif val_size >= num_ratings: + raise ValueError("val_size={} should be greater than zero".format(val_size)) + elif val_size >= data_size: raise ValueError( - 'val_size={} should be less than the number of ratings {}'.format(val_size, num_ratings)) + f"val_size={val_size} should be smaller than data_size={data_size}" + ) if test_size is None: test_size = 0.0 elif test_size < 0: - raise ValueError('test_size={} should be greater than zero'.format(test_size)) - elif test_size >= num_ratings: + raise ValueError(f"test_size={test_size} should be greater than zero") + elif test_size >= data_size: raise ValueError( - 'test_size={} should be less than the number of ratings {}'.format(test_size, num_ratings)) + f"test_size={test_size} should be smaller than data_size={data_size}" + ) if val_size < 1: - val_size = ceil(val_size * num_ratings) + val_size = ceil(val_size * data_size) if test_size < 1: - test_size = ceil(test_size * num_ratings) + test_size = ceil(test_size * data_size) - if val_size + test_size >= num_ratings: + val_test_size = val_size + test_size + if val_test_size >= data_size: raise ValueError( - 'The sum of val_size and test_size ({}) should be smaller than the number of ratings {}'.format( - val_size + test_size, num_ratings)) + 
f"val_size + test_size ({val_test_size}) should be smaller than data_size={data_size}" + ) - train_size = num_ratings - (val_size + test_size) + train_size = data_size - (val_size + test_size) return int(train_size), int(val_size), int(test_size) def _split(self): - data_idx = self.rng.permutation(len(self._data)) - train_idx = data_idx[:self.train_size] - test_idx = data_idx[-self.test_size:] - val_idx = data_idx[self.train_size:-self.test_size] - - train_data = safe_indexing(self._data, train_idx) - test_data = safe_indexing(self._data, test_idx) - val_data = safe_indexing(self._data, val_idx) if len(val_idx) > 0 else None + data_idx = self.rng.permutation(len(self.data)) + train_idx = data_idx[: self.train_size] + test_idx = data_idx[-self.test_size :] + val_idx = data_idx[self.train_size : -self.test_size] + + train_data = safe_indexing(self.data, train_idx) + test_data = safe_indexing(self.data, test_idx) + val_data = safe_indexing(self.data, val_idx) if len(val_idx) > 0 else None self.build(train_data=train_data, test_data=test_data, val_data=val_data) diff --git a/cornac/eval_methods/stratified_split.py b/cornac/eval_methods/stratified_split.py index cbe6610f3..e0ea84080 100644 --- a/cornac/eval_methods/stratified_split.py +++ b/cornac/eval_methods/stratified_split.py @@ -90,7 +90,7 @@ def __init__( ) ) - if chrono and (fmt != "UIRT" or len(self._data[0]) != 4): + if chrono and (fmt != "UIRT" or len(self.data[0]) != 4): raise ValueError( 'Input data must be in "UIRT" format for sorting chronologically.' ) @@ -104,7 +104,7 @@ def __init__( def _split(self): data = ( - sorted(self._data, key=lambda x: x[3]) if self.chrono else self._data + sorted(self.data, key=lambda x: x[3]) if self.chrono else self.data ) # sort data chronologically grouped_indices = defaultdict(list) diff --git a/cornac/experiment/experiment.py b/cornac/experiment/experiment.py index 6a721f161..5f1ecb247 100644 --- a/cornac/experiment/experiment.py +++ b/cornac/experiment/experiment.py @@ -153,4 +153,4 @@ def run(self): save_dir = "." if self.save_dir is None else self.save_dir output_file = os.path.join(save_dir, "CornacExp-{}.log".format(timestamp)) with open(output_file, "w") as f: - f.write(output) \ No newline at end of file + f.write(output) diff --git a/cornac/metrics/__init__.py b/cornac/metrics/__init__.py index ef50414a7..869f0d5ab 100644 --- a/cornac/metrics/__init__.py +++ b/cornac/metrics/__init__.py @@ -23,6 +23,7 @@ from .ranking import NDCG from .ranking import NCRR from .ranking import MRR +from .ranking import HitRatio from .ranking import Precision from .ranking import Recall from .ranking import FMeasure diff --git a/cornac/metrics/ranking.py b/cornac/metrics/ranking.py index 66290c261..e84a3ec2f 100644 --- a/cornac/metrics/ranking.py +++ b/cornac/metrics/ranking.py @@ -274,6 +274,44 @@ def compute(self, gt_pos, pd_rank, **kwargs): return tp, tp_fn, tp_fp +class HitRatio(MeasureAtK): + """Hit Ratio. + + Parameters + ---------- + k: int, optional, default: -1 (all) + The number of items in the top@k list. + If None, all items will be considered. + + """ + + def __init__(self, k=-1): + super().__init__(name="HitRatio@{}".format(k), k=k) + + def compute(self, gt_pos, pd_rank, **kwargs): + """Compute Hit Ratio. + + Parameters + ---------- + gt_pos: Numpy array + Vector of positive items. + + pd_rank: Numpy array + Item ranking prediction. + + **kwargs: For compatibility + + Returns + ------- + res: A scalar + Hit Ratio score (1.0 ground truth item(s) appear in top-k, 0 otherwise). 
+ + """ + tp, *_ = MeasureAtK.compute(self, gt_pos, pd_rank, **kwargs) + + return 1.0 if tp > 0 else 0.0 + + class Precision(MeasureAtK): """Precision@K. @@ -466,7 +504,7 @@ def compute(self, item_indices, pd_scores, gt_pos, **kwargs): ---------- item_indices: Numpy array Items being considered for evaluation. - + pd_scores: Numpy array Prediction scores for items. diff --git a/cornac/models/__init__.py b/cornac/models/__init__.py index cb4c5570f..c43c3fbd3 100644 --- a/cornac/models/__init__.py +++ b/cornac/models/__init__.py @@ -14,6 +14,7 @@ # ============================================================================ from .recommender import Recommender +from .recommender import NextBasketRecommender from .amr import AMR from .ann import HNSWLibANN @@ -36,6 +37,7 @@ from .efm import EFM from .gcmc import GCMC from .global_avg import GlobalAvg +from .gp_top import GPTop from .hft import HFT from .hpf import HPF from .hrdr import HRDR diff --git a/cornac/models/gp_top/__init__.py b/cornac/models/gp_top/__init__.py new file mode 100644 index 000000000..f7a1ce6be --- /dev/null +++ b/cornac/models/gp_top/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2023 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +from .recom_gp_top import GPTop diff --git a/cornac/models/gp_top/recom_gp_top.py b/cornac/models/gp_top/recom_gp_top.py new file mode 100644 index 000000000..32d2ba91b --- /dev/null +++ b/cornac/models/gp_top/recom_gp_top.py @@ -0,0 +1,72 @@ +# Copyright 2023 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import numpy as np +from collections import Counter + +from ..recommender import NextBasketRecommender + + +class GPTop(NextBasketRecommender): + """Global Personalized Top Frequent Items. + + Parameters + ---------- + name: string, default: 'GPTop' + The name of the recommender model. + + use_global_popularity: boolean, optional, default: True + When False, no item frequency from all users' baskets are being used. + + use_personalized_popularity: boolean, optional, default: True + When False, no item frequency from history baskets are being used. + + References + ---------- + Ming Li, Sami Jullien, Mozhdeh Ariannezhad, and Maarten de Rijke. 2023. + A Next Basket Recommendation Reality Check. + ACM Trans. Inf. Syst. 
41, 4, Article 116 (October 2023), 29 pages. https://doi.org/10.1145/3587153 + + """ + + def __init__( + self, name="GPTop", use_global_popularity=True, use_personalized_popularity=True + ): + super().__init__(name=name, trainable=False) + self.use_global_popularity = use_global_popularity + self.use_personalized_popularity = use_personalized_popularity + self.item_freq = Counter() + + def fit(self, train_set, val_set=None): + super().fit(train_set=train_set, val_set=val_set) + if self.use_global_popularity: + self.item_freq = Counter(self.train_set.uir_tuple[1]) + return self + + def score(self, user_idx, history_baskets, **kwargs): + item_scores = np.ones(self.total_items) + if self.use_global_popularity: + for iid, freq in self.item_freq.items(): + item_scores[iid] = freq + + if self.use_personalized_popularity: + p_item_freq = Counter([iid for iids in history_baskets for iid in iids]) + + max_item_freq = ( + max(self.item_freq.values()) if len(self.item_freq) > 0 else 1 + ) + for iid, cnt in p_item_freq.most_common(): + item_scores[iid] = max_item_freq + cnt + return item_scores diff --git a/cornac/models/recommender.py b/cornac/models/recommender.py index c81d7f9f3..df71a8a2c 100644 --- a/cornac/models/recommender.py +++ b/cornac/models/recommender.py @@ -23,7 +23,61 @@ import numpy as np from ..exception import ScoreException -from ..utils.common import intersects, clip +from ..utils.common import clip + + +MEASURE_L2 = "l2 distance aka. Euclidean distance" +MEASURE_DOT = "dot product aka. inner product" +MEASURE_COSINE = "cosine similarity" + + +def is_ann_supported(recom): + """Return True if the given recommender model support ANN search. + + Parameters + ---------- + recom : recommender model + Recommender object to test. + + Returns + ------- + out : bool + True if recom supports ANN search and False otherwise. + """ + return getattr(recom, "_ann_supported", False) + + +class ANNMixin: + """Mixin class for Approximate Nearest Neighbor Search.""" + + _ann_supported = True + + def get_vector_measure(self): + """Getting a valid choice of vector measurement in ANNMixin._measures. + + Returns + ------- + :raise NotImplementedError + """ + raise NotImplementedError() + + def get_user_vectors(self): + """Getting a matrix of user vectors serving as query for ANN search. + + Returns + ------- + :raise NotImplementedError + """ + raise NotImplementedError() + + def get_item_vectors(self): + """Getting a matrix of item vectors used for building the index for ANN search. + + Returns + ------- + :raise NotImplementedError + """ + raise NotImplementedError() class Recommender: @@ -352,7 +406,7 @@ def rate(self, user_idx, item_idx, clipping=True): return rating_pred - def rank(self, user_idx, item_indices=None): + def rank(self, user_idx, item_indices=None, **kwargs): """Rank all test items for a given user. Parameters @@ -373,7 +427,7 @@ def rank(self, user_idx, item_indices=None): """ # obtain item scores from the model try: - known_item_scores = self.score(user_idx) + known_item_scores = self.score(user_idx, **kwargs) except ScoreException: known_item_scores = np.ones(self.total_items) * self.default_score() @@ -518,55 +572,58 @@ def early_stop(self, train_set, val_set, min_delta=0.0, patience=0): return False -MEASURE_L2 = "l2 distance aka. Euclidean distance" -MEASURE_DOT = "dot product aka. inner product" -MEASURE_COSINE = "cosine similarity" +class NextBasketRecommender(Recommender): + """Generic class for a next basket recommender model. 
All next basket recommendation models should inherit from this class. + Parameters + ---------------- + name: str, required + Name of the recommender model. -class ANNMixin: - """Mixin class for Approximate Nearest Neighbor Search.""" + trainable: boolean, optional, default: True + When False, the model is not trainable. - _ann_supported = True + verbose: boolean, optional, default: False + When True, running logs are displayed. - def get_vector_measure(self): - """Getting a valid choice of vector measurement in ANNMixin._measures. + Attributes + ---------- + num_users: int + Number of users in training data. - Returns - ------- - :raise NotImplementedError - """ - raise NotImplementedError() + num_items: int + Number of items in training data. - def get_user_vectors(self): - """Getting a matrix of user vectors serving as query for ANN search. + total_users: int + Number of users in training, validation, and test data. + In other words, this includes unknown/unseen users. - Returns - ------- - :raise NotImplementedError - """ - raise NotImplementedError() + total_items: int + Number of items in training, validation, and test data. + In other words, this includes unknown/unseen items. - def get_item_vectors(self): - """Getting a matrix of item vectors used for building the index for ANN search. + uid_map: int + Global mapping of user ID-index. - Returns - ------- - :raise NotImplementedError - """ - raise NotImplementedError() + iid_map: int + Global mapping of item ID-index. + """ + def __init__(self, name, trainable=True, verbose=False): + super().__init__(name=name, trainable=trainable, verbose=verbose) -def is_ann_supported(recom): - """Return True if the given recommender model support ANN search. + def score(self, user_idx, history_baskets, **kwargs): + """Predict the scores for all items based on input history baskets - Parameters - ---------- - recom : recommender model - Recommender object to test. + Parameters + ---------- + history_baskets: list of lists + The list of history baskets in sequential manner for next-basket prediction. - Returns - ------- - out : bool - True if recom supports ANN search and False otherwise. - """ - return getattr(recom, "_ann_supported", False) + Returns + ------- + res : a Numpy array + Relative scores of all known items + + """ + raise NotImplementedError("The algorithm is not able to make score prediction!") diff --git a/docs/source/api_ref/metrics.rst b/docs/source/api_ref/metrics.rst index cfb5969ca..781d3b064 100644 --- a/docs/source/api_ref/metrics.rst +++ b/docs/source/api_ref/metrics.rst @@ -31,6 +31,10 @@ Fmeasure (F1) ------------- .. autoclass:: FMeasure +Hit Ratio (HitRatio) +------------------- +.. autoclass:: HitRatio + Mean Average Precision (MAP) ---------------------------- .. autoclass:: MAP diff --git a/examples/README.md b/examples/README.md index 85384c57d..1a7058027 100644 --- a/examples/README.md +++ b/examples/README.md @@ -103,3 +103,9 @@ [vaecf_citeulike.py](vaecf_citeulike.py) - Variational Autoencoder for Collaborative Filtering (VAECF) with CiteULike dataset. [wmf_example.py](wmf_example.py) - Weighted Matrix Factorization with CiteULike dataset. + +---- + +## Next-Basket Algorithms + +[gp_top_tafeng.py](gp_top_tafeng.py) - Next-basket recommendation model that merely uses item top frequency. diff --git a/examples/gp_top_tafeng.py b/examples/gp_top_tafeng.py new file mode 100644 index 000000000..efaa56859 --- /dev/null +++ b/examples/gp_top_tafeng.py @@ -0,0 +1,47 @@ +# Copyright 2023 The Cornac Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Example of a next-basket recommendation model that merely uses item top frequency""" + +import cornac +from cornac.eval_methods import NextBasketEvaluation +from cornac.metrics import NDCG, HitRatio, Recall +from cornac.models import GPTop + +data = cornac.datasets.tafeng.load_basket( + reader=cornac.data.Reader( + min_basket_size=3, max_basket_size=50, min_basket_sequence=2 + ) +) + +next_basket_eval = NextBasketEvaluation( + data=data, fmt="UBITJson", test_size=0.2, val_size=0.08, seed=123, verbose=True +) + +models = [ + GPTop(name="PTop", use_global_popularity=False), + GPTop(name="GTop", use_personalized_popularity=False), + GPTop(), +] + +metrics = [ + Recall(k=10), + Recall(k=50), + NDCG(k=10), + NDCG(k=50), + HitRatio(k=10), + HitRatio(k=50), +] + +cornac.Experiment(eval_method=next_basket_eval, models=models, metrics=metrics).run() diff --git a/tests/cornac/metrics/test_ranking.py b/tests/cornac/metrics/test_ranking.py index 3504d6602..ff2e32fd4 100644 --- a/tests/cornac/metrics/test_ranking.py +++ b/tests/cornac/metrics/test_ranking.py @@ -22,6 +22,7 @@ from cornac.metrics import NDCG from cornac.metrics import NCRR from cornac.metrics import MRR +from cornac.metrics import HitRatio from cornac.metrics import Precision from cornac.metrics import Recall from cornac.metrics import FMeasure @@ -145,6 +146,34 @@ def test_measure_at_k(self): self.assertEqual(2, tp_fn) self.assertEqual(3, tp_fp) + def test_hit_ratio(self): + hr = HitRatio() + + self.assertEqual(hr.type, "ranking") + self.assertEqual(hr.name, "HitRatio@-1") + + self.assertEqual(1, hr.compute(np.asarray([0]), np.asarray([0]))) + self.assertEqual(1, hr.compute(np.asarray([0, 1]), np.asarray([0, 2]))) + + gt_pos = np.asarray([0, 2]) # [1, 0, 1] + pd_rank = np.asarray([0, 2, 1]) # [1, 1, 1] + self.assertEqual(1, hr.compute(gt_pos, pd_rank)) + + gt_pos = np.asarray([2]) # [0, 0, 1] + pd_rank = np.asarray([1, 2, 0]) # [1, 1, 1] + self.assertEqual(1, hr.compute(gt_pos, pd_rank)) + + hr_2 = HitRatio(k=2) + self.assertEqual(hr_2.k, 2) + + gt_pos = np.asarray([0]) # [0, 0, 1] + pd_rank = np.asarray([1, 2, 0]) # [1, 1, 1] + self.assertEqual(0, hr_2.compute(gt_pos, pd_rank)) + + gt_pos = np.asarray([2]) # [0, 0, 1] + pd_rank = np.asarray([1, 2, 0]) # [1, 1, 1] + self.assertEqual(1, hr_2.compute(gt_pos, pd_rank)) + def test_precision(self): prec = Precision()