Skip to content

Commit

Permalink
Add next-basket evaluation method (#545)
Browse files Browse the repository at this point in the history
  • Loading branch information
lthoang authored Nov 27, 2023
1 parent 2bdae6d commit 4af34f2
Show file tree
Hide file tree
Showing 25 changed files with 1,276 additions and 95 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ The recommender models supported by Cornac are listed below. Why don't you join
| | [Bayesian Personalized Ranking (BPR)](cornac/models/bpr), [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) | N/A | [bpr_netflix.py](examples/bpr_netflix.py)
| | [Factorization Machines (FM)](cornac/models/fm), [paper](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf) | Linux only | [fm_example.py](examples/fm_example.py)
| | [Global Average (GlobalAvg)](cornac/models/global_avg), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py)
| | [Global Personalized Top Frequent (GPTop)](cornac/models/gp_top), [paper](https://dl.acm.org/doi/pdf/10.1145/3587153) | N/A | [gp_top_tafeng.py](examples/gp_top_tafeng.py)
| | [Item K-Nearest-Neighbors (ItemKNN)](cornac/models/knn), [paper](https://dl.acm.org/doi/pdf/10.1145/371920.372071) | N/A | [knn_movielens.py](examples/knn_movielens.py)
| | [Matrix Factorization (MF)](cornac/models/mf), [paper](https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf) | N/A | [biased_mf.py](examples/biased_mf.py), [given_data.py](examples/given_data.py)
| | [Maximum Margin Matrix Factorization (MMMF)](cornac/models/mmmf), [paper](https://link.springer.com/content/pdf/10.1007/s10994-008-5073-7.pdf) | N/A | [mmmf_exp.py](examples/mmmf_exp.py)
Expand Down
2 changes: 2 additions & 0 deletions cornac/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
from .sentiment import SentimentModality
from .reader import Reader
from .dataset import Dataset
from .dataset import BasketDataset

# Public names exported by this package on a star-import.
__all__ = [
    "FeatureModality",
    "TextModality",
    "ReviewModality",
    "ImageModality",
    "GraphModality",
    "SentimentModality",
    "BasketDataset",
    "Dataset",
    "Reader",
]
360 changes: 354 additions & 6 deletions cornac/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,13 @@
# limitations under the License.
# ============================================================================

from collections import OrderedDict, defaultdict
import itertools
import warnings
from collections import Counter, OrderedDict, defaultdict

import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, dok_matrix
from scipy.sparse import csc_matrix, csr_matrix, dok_matrix

from ..utils import get_rng
from ..utils import validate_format
from ..utils import estimate_batches
from ..utils import estimate_batches, get_rng, validate_format


class Dataset(object):
Expand Down Expand Up @@ -565,3 +562,354 @@ def add_modalities(self, **kwargs):
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)


class BasketDataset(Dataset):
    """Training set containing the basket history of users.

    Every observation is a (user, basket, item) interaction. Observations
    sharing the same basket index make up one basket.

    Parameters
    ----------
    num_users: int, required
        Number of users.

    num_baskets: int, required
        Number of baskets.

    num_items: int, required
        Number of items.

    uid_map: :obj:`OrderedDict`, required
        The dictionary containing mapping from user original ids to mapped integer indices.

    bid_map: :obj:`OrderedDict`, required
        The dictionary containing mapping from basket original ids to mapped integer indices.

    iid_map: :obj:`OrderedDict`, required
        The dictionary containing mapping from item original ids to mapped integer indices.

    uir_tuple: tuple, required
        Tuple of 3 numpy arrays (user_indices, item_indices, rating_values).

    basket_ids: numpy.array, required
        Array of basket indices corresponding to observations in `uir_tuple`.
        The signature defaults it to `None` only for keyword ordering;
        a non-empty array must be given.

    timestamps: numpy.array, optional, default: None
        Numpy array of timestamps corresponding to feedback in `uir_tuple`.
        This is only available when input data is in `UBIT` and `UBITJson` formats.

    extra_data: numpy.array, optional, default: None
        Array of json object corresponding to observations in `uir_tuple`.

    seed: int, optional, default: None
        Random seed for reproducing data sampling.

    Attributes
    ----------
    num_baskets: int
        Number of baskets.

    bid_map: :obj:`OrderedDict`
        Mapping from basket original ids to mapped integer indices.

    basket_ids: numpy.array
        Basket index of every observation in `uir_tuple`.

    extra_data: numpy.array
        Extra data of every observation, or `None` if not provided.

    max_basket_size: int
        Largest number of observations found in a single basket.

    min_basket_size: int
        Smallest number of observations found in a single basket.

    avg_basket_size: float
        Average number of observations per basket.
    """

    def __init__(
        self,
        num_users,
        num_baskets,
        num_items,
        uid_map,
        bid_map,
        iid_map,
        uir_tuple,
        basket_ids=None,
        timestamps=None,
        extra_data=None,
        seed=None,
    ):
        super().__init__(
            num_users=num_users,
            num_items=num_items,
            uid_map=uid_map,
            iid_map=iid_map,
            uir_tuple=uir_tuple,
            timestamps=timestamps,
            seed=seed,
        )
        self.num_baskets = num_baskets
        self.bid_map = bid_map
        self.basket_ids = basket_ids
        self.extra_data = extra_data

        # Number of observations per basket.
        basket_sizes = list(Counter(basket_ids).values())
        if not basket_sizes:
            # np.max/np.min on an empty list raise an obscure ValueError;
            # raise the same exception type with an actionable message.
            raise ValueError("basket_ids is empty, at least one basket is required")
        self.max_basket_size = np.max(basket_sizes)
        self.min_basket_size = np.min(basket_sizes)
        self.avg_basket_size = np.mean(basket_sizes)

        # Lazily built caches, see the properties of the same names.
        self.__baskets = None
        self.__user_basket_data = None
        self.__chrono_user_basket_data = None

    @property
    def baskets(self):
        """A dictionary to store indices where basket ID appears in the data.

        Keys are basket indices (in order of first appearance), values are
        lists of positions into `basket_ids` / `uir_tuple`.
        """
        if self.__baskets is None:
            grouped = OrderedDict()
            for idx, bid in enumerate(self.basket_ids):
                grouped.setdefault(bid, []).append(idx)
            self.__baskets = grouped
        return self.__baskets

    @property
    def user_basket_data(self):
        """Data organized by user. A dictionary where keys are users,
        values are list of baskets purchased by corresponding users.

        The owner of a basket is the user of the basket's first observation.
        """
        if self.__user_basket_data is None:
            user_indices = self.uir_tuple[0]
            # A defaultdict without default factory: behaves like a plain
            # dict (missing keys raise KeyError) while keeping the type.
            per_user = defaultdict()
            for bid, ids in self.baskets.items():
                per_user.setdefault(user_indices[ids[0]], []).append(bid)
            self.__user_basket_data = per_user
        return self.__user_basket_data

    @property
    def chrono_user_basket_data(self):
        """Data organized by user sorted chronologically (timestamps required).
        A dictionary where keys are users, values are tuples of two chronologically
        sorted lists (baskets, timestamps) interacted by the corresponding users.

        The timestamp of a basket is the timestamp of its first observation.

        Raises
        ------
        AssertionError
            If the dataset has no timestamps.
        """
        if self.__chrono_user_basket_data is None:
            # Kept as an assertion so the exception type seen by callers
            # does not change.
            assert (
                self.timestamps is not None
            ), "timestamps are required to order baskets chronologically"

            user_indices = self.uir_tuple[0]
            chrono = defaultdict(lambda: ([], []))
            for bid, ids in self.baskets.items():
                first = ids[0]  # the first observation represents the basket
                user_baskets, user_times = chrono[user_indices[first]]
                user_baskets.append(bid)
                user_times.append(self.timestamps[first])

            # sorting based on timestamps; only existing keys are
            # reassigned, so iterating over items() here is safe
            for user, (user_baskets, user_times) in chrono.items():
                order = np.argsort(user_times)
                chrono[user] = (
                    [user_baskets[i] for i in order],
                    [user_times[i] for i in order],
                )
            self.__chrono_user_basket_data = chrono

        return self.__chrono_user_basket_data

    @classmethod
    def build(
        cls,
        data,
        fmt="UBI",
        global_uid_map=None,
        global_bid_map=None,
        global_iid_map=None,
        seed=None,
        exclude_unknowns=False,
    ):
        """Constructing Dataset from given data of specific format.

        Parameters
        ----------
        data: list, required
            Data in the form of tuples (user, basket, item) for UBI format,
            (user, basket, item, timestamp) for UBIT format,
            or (user, basket, item, timestamp, json) for UBITJson format.
            Must support integer indexing when timestamps or extra data are read.

        fmt: str, default: 'UBI'
            Format of the input data. Currently, we are supporting:

            'UBI': User, Basket_ID, Item
            'UBIT': User, Basket_ID, Item, Timestamp
            'UBITJson': User, Basket_ID, Item, Timestamp, Extra data in Json format

        global_uid_map: :obj:`defaultdict`, optional, default: None
            The dictionary containing global mapping from original ids to mapped ids of users.
            Updated in place with users met in `data`.

        global_bid_map: :obj:`defaultdict`, optional, default: None
            The dictionary containing global mapping from original ids to mapped ids of baskets.
            Updated in place with baskets met in `data`.

        global_iid_map: :obj:`defaultdict`, optional, default: None
            The dictionary containing global mapping from original ids to mapped ids of items.
            Updated in place with items met in `data`.

        seed: int, optional, default: None
            Random seed for reproducing data sampling.

        exclude_unknowns: bool, default: False
            Ignore observations whose item is not already in `global_iid_map`.
            Only items are checked; unseen users and baskets are still added.

        Returns
        -------
        res: :obj:`<cornac.data.BasketDataset>`
            BasketDataset object.

        Raises
        ------
        ValueError
            If no observation is left to build the dataset from.
        """
        fmt = validate_format(fmt, ["UBI", "UBIT", "UBITJson"])

        if global_uid_map is None:
            global_uid_map = OrderedDict()
        if global_bid_map is None:
            global_bid_map = OrderedDict()
        if global_iid_map is None:
            global_iid_map = OrderedDict()

        u_indices = []
        b_indices = []
        i_indices = []
        valid_idx = []  # positions in `data` of the observations that are kept
        for idx, (uid, bid, iid, *_) in enumerate(data):
            if exclude_unknowns and (iid not in global_iid_map):
                continue

            # setdefault returns the existing index or registers the next one
            u_indices.append(global_uid_map.setdefault(uid, len(global_uid_map)))
            b_indices.append(global_bid_map.setdefault(bid, len(global_bid_map)))
            i_indices.append(global_iid_map.setdefault(iid, len(global_iid_map)))
            valid_idx.append(idx)

        if not valid_idx:
            # Fail early with a clear message rather than on a reduction
            # over an empty array further down the line.
            raise ValueError(
                "No observation left to build the dataset: "
                "data is empty or everything has been filtered out"
            )

        uir_tuple = (
            np.asarray(u_indices, dtype="int"),
            np.asarray(i_indices, dtype="int"),
            np.ones(len(u_indices), dtype="float"),  # implicit feedback: all ones
        )

        basket_ids = np.asarray(b_indices, dtype="int")

        timestamps = (
            np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int")
            if fmt in ["UBIT", "UBITJson"]
            else None
        )

        extra_data = [data[i][4] for i in valid_idx] if fmt == "UBITJson" else None

        return cls(
            num_users=len(global_uid_map),
            num_baskets=len(global_bid_map),
            num_items=len(global_iid_map),
            uid_map=global_uid_map,
            bid_map=global_bid_map,
            iid_map=global_iid_map,
            uir_tuple=uir_tuple,
            basket_ids=basket_ids,
            timestamps=timestamps,
            extra_data=extra_data,
            seed=seed,
        )

    @classmethod
    def from_ubi(cls, data, seed=None):
        """Constructing Dataset from UBI (User, Basket, Item) triples data.

        Parameters
        ----------
        data: list
            Data in the form of tuples (user, basket, item).

        seed: int, optional, default: None
            Random seed for reproducing data sampling.

        Returns
        -------
        res: :obj:`<cornac.data.BasketDataset>`
            BasketDataset object.
        """
        return cls.build(data, fmt="UBI", seed=seed)

    @classmethod
    def from_ubit(cls, data, seed=None):
        """Constructing Dataset from UBIT format (User, Basket, Item, Timestamp)

        Parameters
        ----------
        data: list
            Data in the form of quadruples (user, basket, item, timestamp)

        seed: int, optional, default: None
            Random seed for reproducing data sampling.

        Returns
        -------
        res: :obj:`<cornac.data.BasketDataset>`
            BasketDataset object.
        """
        return cls.build(data, fmt="UBIT", seed=seed)

    @classmethod
    def from_ubitjson(cls, data, seed=None):
        """Constructing Dataset from UBITJson format (User, Basket, Item, Timestamp, Json)

        Parameters
        ----------
        data: list
            Data in the form of tuples (user, basket, item, timestamp, json)

        seed: int, optional, default: None
            Random seed for reproducing data sampling.

        Returns
        -------
        res: :obj:`<cornac.data.BasketDataset>`
            BasketDataset object.
        """
        return cls.build(data, fmt="UBITJson", seed=seed)

    def num_batches(self, batch_size):
        """Estimate number of batches per epoch"""
        # `user_data` is inherited from Dataset (one entry per user,
        # presumably -- defined outside this class).
        return estimate_batches(len(self.user_data), batch_size)

    def user_basket_data_iter(self, batch_size=1, shuffle=False):
        """Create an iterator over data yielding batch of user indices
        and batch of basket indices of those users

        Parameters
        ----------
        batch_size: int, optional, default = 1

        shuffle: bool, optional, default: False
            If `True`, orders of users will be randomized. If `False`, default orders kept.

        Returns
        -------
        iterator : batch of user indices, batch of user data corresponding to user indices.
            The user data is a 2-D integer array when all users of the batch
            have the same number of baskets, otherwise a 1-D object array
            holding one list of basket indices per user.
        """
        user_indices = np.asarray(list(self.user_basket_data.keys()), dtype="int")
        for batch_ids in self.idx_iter(
            len(user_indices), batch_size=batch_size, shuffle=shuffle
        ):
            batch_users = user_indices[batch_ids]
            rows = [self.user_basket_data[uid] for uid in batch_users]
            if len({len(row) for row in rows}) <= 1:
                batch_basket_ids = np.asarray(rows, dtype="int")
            else:
                # Users rarely have the same number of baskets; a ragged
                # batch cannot be turned into a rectangular integer array
                # (numpy raises ValueError), so keep one list per user.
                batch_basket_ids = np.empty(len(rows), dtype=object)
                for pos, row in enumerate(rows):
                    batch_basket_ids[pos] = row
            yield batch_users, batch_basket_ids

    def basket_iter(self, batch_size=1, shuffle=False):
        """Create an iterator over data yielding batch of basket indices and batch of baskets

        Parameters
        ----------
        batch_size: int, optional, default = 1

        shuffle: bool, optional, default: False
            If `True`, orders of baskets will be randomized. If `False`, default orders kept.

        Returns
        -------
        iterator : batch of basket indices, batch of baskets (list of list),
            where every basket is the list of positions of its observations.
        """
        basket_indices = np.array(list(self.baskets.keys()))
        baskets = list(self.baskets.values())
        for batch_ids in self.idx_iter(len(basket_indices), batch_size, shuffle):
            batch_basket_indices = basket_indices[batch_ids]
            batch_baskets = [baskets[idx] for idx in batch_ids]
            yield batch_basket_indices, batch_baskets
Loading

0 comments on commit 4af34f2

Please sign in to comment.